569 files changed, 34310 insertions, 7852 deletions
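
Most of this patch is Documentation/.renames.txt, a plain-text map of old
documentation page paths to their current locations, one whitespace-separated
pair per line. A minimal sketch of how such a file could be parsed
(hypothetical helper; the actual consumer is expected to live under
tools/docs/lib, which the .pylintrc hunk below adds to the pylint search
path, but it is not shown in this diff):

    # Hypothetical reader for the two-column rename map; the function name
    # and default path are assumptions, not part of this patch.
    def load_renames(path="Documentation/.renames.txt"):
        """Return a dict mapping an old doc path to its current one."""
        renames = {}
        with open(path, encoding="utf-8") as f:
            for line in f:
                fields = line.split()
                if len(fields) == 2:  # "old/path new/path"
                    renames[fields[0]] = fields[1]
        return renames

    # e.g. load_renames()["vm/slub"] == "admin-guide/mm/slab"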
diff --git a/.pylintrc b/.pylintrc
index 30b8ae1659f8..89eaf2100edd 100644
--- a/.pylintrc
+++ b/.pylintrc
@@ -1,2 +1,2 @@
 [MASTER]
-init-hook='import sys; sys.path += ["scripts/lib/kdoc", "scripts/lib/abi"]'
+init-hook='import sys; sys.path += ["scripts/lib/kdoc", "scripts/lib/abi", "tools/docs/lib"]'
diff --git a/Documentation/.renames.txt b/Documentation/.renames.txt
new file mode 100644
index 000000000000..c0bd5d3dc8b9
--- /dev/null
+++ b/Documentation/.renames.txt
@@ -0,0 +1,1191 @@
+80211/cfg80211 driver-api/80211/cfg80211
+80211/index driver-api/80211/index
+80211/introduction driver-api/80211/introduction
+80211/mac80211 driver-api/80211/mac80211
+80211/mac80211-advanced driver-api/80211/mac80211-advanced
+EDID/howto admin-guide/edid
+PCI/picebus-howto PCI/pciebus-howto
+RAS/address-translation admin-guide/RAS/address-translation
+RAS/error-decoding admin-guide/RAS/error-decoding
+RAS/ras admin-guide/RAS/error-decoding
+accelerators/ocxl userspace-api/accelerators/ocxl
+admin-guide/gpio/sysfs userspace-api/gpio/sysfs
+admin-guide/l1tf admin-guide/hw-vuln/l1tf
+admin-guide/media/v4l-with-ir admin-guide/media/remote-controller
+admin-guide/ras admin-guide/RAS/main
+admin-guide/security-bugs process/security-bugs
+aoe/aoe admin-guide/aoe/aoe
+aoe/examples admin-guide/aoe/examples
+aoe/index admin-guide/aoe/index
+aoe/todo admin-guide/aoe/todo
+arc/arc arch/arc/arc
+arc/features arch/arc/features
+arc/index arch/arc/index
+arch/x86/resctrl filesystems/resctrl
+arm/arm arch/arm/arm
+arm/booting arch/arm/booting
+arm/cluster-pm-race-avoidance arch/arm/cluster-pm-race-avoidance
+arm/features arch/arm/features
+arm/firmware arch/arm/firmware
+arm/google/chromebook-boot-flow arch/arm/google/chromebook-boot-flow
+arm/index arch/arm/index
+arm/interrupts arch/arm/interrupts
+arm/ixp4xx arch/arm/ixp4xx
+arm/kernel_mode_neon arch/arm/kernel_mode_neon
+arm/kernel_user_helpers arch/arm/kernel_user_helpers
+arm/keystone/knav-qmss arch/arm/keystone/knav-qmss
+arm/keystone/overview arch/arm/keystone/overview
+arm/marvel arch/arm/marvell
+arm/marvell arch/arm/marvell
+arm/mem_alignment arch/arm/mem_alignment
+arm/memory arch/arm/memory
+arm/microchip arch/arm/microchip
+arm/netwinder arch/arm/netwinder
+arm/nwfpe/index arch/arm/nwfpe/index
+arm/nwfpe/netwinder-fpe arch/arm/nwfpe/netwinder-fpe
+arm/nwfpe/notes arch/arm/nwfpe/notes
+arm/nwfpe/nwfpe arch/arm/nwfpe/nwfpe
+arm/nwfpe/todo arch/arm/nwfpe/todo
+arm/omap/dss arch/arm/omap/dss
+arm/omap/index arch/arm/omap/index
+arm/omap/omap arch/arm/omap/omap
+arm/omap/omap_pm arch/arm/omap/omap_pm
+arm/porting arch/arm/porting
+arm/pxa/mfp arch/arm/pxa/mfp
+arm/sa1100/assabet arch/arm/sa1100/assabet
+arm/sa1100/cerf arch/arm/sa1100/cerf
+arm/sa1100/index arch/arm/sa1100/index
+arm/sa1100/lart arch/arm/sa1100/lart
+arm/sa1100/serial_uart arch/arm/sa1100/serial_uart
+arm/samsung/bootloader-interface arch/arm/samsung/bootloader-interface
+arm/samsung/gpio arch/arm/samsung/gpio
+arm/samsung/index arch/arm/samsung/index
+arm/samsung/overview arch/arm/samsung/overview
+arm/setup arch/arm/setup
+arm/spear/overview arch/arm/spear/overview
+arm/sti/overview arch/arm/sti/overview
+arm/sti/stih407-overview arch/arm/sti/stih407-overview
+arm/sti/stih418-overview arch/arm/sti/stih418-overview
+arm/stm32/overview arch/arm/stm32/overview
+arm/stm32/stm32-dma-mdma-chaining arch/arm/stm32/stm32-dma-mdma-chaining
+arm/stm32/stm32f429-overview arch/arm/stm32/stm32f429-overview
+arm/stm32/stm32f746-overview arch/arm/stm32/stm32f746-overview
+arm/stm32/stm32f769-overview arch/arm/stm32/stm32f769-overview
+arm/stm32/stm32h743-overview arch/arm/stm32/stm32h743-overview
+arm/stm32/stm32h750-overview arch/arm/stm32/stm32h750-overview
+arm/stm32/stm32mp13-overview arch/arm/stm32/stm32mp13-overview
+arm/stm32/stm32mp151-overview arch/arm/stm32/stm32mp151-overview
+arm/stm32/stm32mp157-overview arch/arm/stm32/stm32mp157-overview
+arm/sunxi arch/arm/sunxi
+arm/sunxi/clocks arch/arm/sunxi/clocks
+arm/swp_emulation arch/arm/swp_emulation
+arm/tcm arch/arm/tcm
+arm/uefi arch/arm/uefi
+arm/vfp/release-notes arch/arm/vfp/release-notes
+arm/vlocks arch/arm/vlocks
+arm64/acpi_object_usage arch/arm64/acpi_object_usage
+arm64/amu arch/arm64/amu
+arm64/arm-acpi arch/arm64/arm-acpi
+arm64/asymmetric-32bit arch/arm64/asymmetric-32bit
+arm64/booting arch/arm64/booting
+arm64/cpu-feature-registers arch/arm64/cpu-feature-registers
+arm64/elf_hwcaps arch/arm64/elf_hwcaps
+arm64/features arch/arm64/features
+arm64/hugetlbpage arch/arm64/hugetlbpage
+arm64/index arch/arm64/index
+arm64/legacy_instructions arch/arm64/legacy_instructions
+arm64/memory arch/arm64/memory
+arm64/memory-tagging-extension arch/arm64/memory-tagging-extension
+arm64/perf arch/arm64/perf
+arm64/pointer-authentication arch/arm64/pointer-authentication
+arm64/silicon-errata arch/arm64/silicon-errata
+arm64/sme arch/arm64/sme
+arm64/sve arch/arm64/sve
+arm64/tagged-address-abi arch/arm64/tagged-address-abi
+arm64/tagged-pointers arch/arm64/tagged-pointers
+asm-annotations core-api/asm-annotations
+auxdisplay/lcd-panel-cgram admin-guide/lcd-panel-cgram
+backlight/lp855x-driver driver-api/backlight/lp855x-driver
+blockdev/drbd/data-structure-v9 admin-guide/blockdev/drbd/data-structure-v9
+blockdev/drbd/figures admin-guide/blockdev/drbd/figures
+blockdev/drbd/index admin-guide/blockdev/drbd/index
+blockdev/floppy admin-guide/blockdev/floppy
+blockdev/index admin-guide/blockdev/index
+blockdev/nbd admin-guide/blockdev/nbd
+blockdev/paride admin-guide/blockdev/paride
+blockdev/ramdisk admin-guide/blockdev/ramdisk
+blockdev/zram admin-guide/blockdev/zram
+bpf/README bpf/index
+bpf/bpf_lsm bpf/prog_lsm
+bpf/instruction-set bpf/standardization/instruction-set
+bpf/libbpf/libbpf bpf/libbpf/index
+bpf/standardization/linux-notes bpf/linux-notes
+bus-devices/ti-gpmc driver-api/memory-devices/ti-gpmc
+cgroup-v1/blkio-controller admin-guide/cgroup-v1/blkio-controller
+cgroup-v1/cgroups admin-guide/cgroup-v1/cgroups
+cgroup-v1/cpuacct admin-guide/cgroup-v1/cpuacct
+cgroup-v1/cpusets admin-guide/cgroup-v1/cpusets
+cgroup-v1/devices admin-guide/cgroup-v1/devices
+cgroup-v1/freezer-subsystem admin-guide/cgroup-v1/freezer-subsystem
+cgroup-v1/hugetlb admin-guide/cgroup-v1/hugetlb
+cgroup-v1/index admin-guide/cgroup-v1/index
+cgroup-v1/memcg_test admin-guide/cgroup-v1/memcg_test
+cgroup-v1/memory admin-guide/cgroup-v1/memory
+cgroup-v1/net_cls admin-guide/cgroup-v1/net_cls
+cgroup-v1/net_prio admin-guide/cgroup-v1/net_prio
+cgroup-v1/pids admin-guide/cgroup-v1/pids
+cgroup-v1/rdma admin-guide/cgroup-v1/rdma
+cma/debugfs admin-guide/mm/cma_debugfs
+connector/connector driver-api/connector
+console/console driver-api/console
+core-api/gcc-plugins kbuild/gcc-plugins
+core-api/ioctl driver-api/ioctl
+core-api/memory-hotplug-notifier core-api/memory-hotplug
+dev-tools/gdb-kernel-debugging process/debugging/gdb-kernel-debugging
+dev-tools/kgdb process/debugging/kgdb
+dev-tools/tools dev-tools/index
+development-process/1.Intro process/1.Intro
+development-process/2.Process process/2.Process
+development-process/3.Early-stage process/3.Early-stage
+development-process/4.Coding process/4.Coding
+development-process/5.Posting process/5.Posting
+development-process/6.Followthrough process/6.Followthrough
+development-process/7.AdvancedTopics process/7.AdvancedTopics
+development-process/8.Conclusion process/8.Conclusion
+development-process/development-process process/development-process
+development-process/index process/index
+device-mapper/cache admin-guide/device-mapper/cache
+device-mapper/cache-policies admin-guide/device-mapper/cache-policies
+device-mapper/delay admin-guide/device-mapper/delay
+device-mapper/dm-crypt admin-guide/device-mapper/dm-crypt
+device-mapper/dm-flakey admin-guide/device-mapper/dm-flakey
+device-mapper/dm-init admin-guide/device-mapper/dm-init
+device-mapper/dm-integrity admin-guide/device-mapper/dm-integrity
+device-mapper/dm-io admin-guide/device-mapper/dm-io
+device-mapper/dm-log admin-guide/device-mapper/dm-log
+device-mapper/dm-queue-length admin-guide/device-mapper/dm-queue-length
+device-mapper/dm-raid admin-guide/device-mapper/dm-raid
+device-mapper/dm-service-time admin-guide/device-mapper/dm-service-time
+device-mapper/dm-uevent admin-guide/device-mapper/dm-uevent
+device-mapper/dm-zoned admin-guide/device-mapper/dm-zoned
+device-mapper/era admin-guide/device-mapper/era
+device-mapper/index admin-guide/device-mapper/index
+device-mapper/kcopyd admin-guide/device-mapper/kcopyd
+device-mapper/linear admin-guide/device-mapper/linear
+device-mapper/log-writes admin-guide/device-mapper/log-writes
+device-mapper/persistent-data admin-guide/device-mapper/persistent-data
+device-mapper/snapshot admin-guide/device-mapper/snapshot
+device-mapper/statistics admin-guide/device-mapper/statistics
+device-mapper/striped admin-guide/device-mapper/striped
+device-mapper/switch admin-guide/device-mapper/switch
+device-mapper/thin-provisioning admin-guide/device-mapper/thin-provisioning
+device-mapper/unstriped admin-guide/device-mapper/unstriped
+device-mapper/verity admin-guide/device-mapper/verity
+device-mapper/writecache admin-guide/device-mapper/writecache
+device-mapper/zero admin-guide/device-mapper/zero
+devicetree/writing-schema devicetree/bindings/writing-schema
+driver-api/bt8xxgpio driver-api/gpio/bt8xxgpio
+driver-api/cxl/access-coordinates driver-api/cxl/linux/access-coordinates
+driver-api/cxl/memory-devices driver-api/cxl/theory-of-operation
+driver-api/dcdbas userspace-api/dcdbas
+driver-api/dell_rbu admin-guide/dell_rbu
+driver-api/edid admin-guide/edid
+driver-api/gpio driver-api/gpio/index
+driver-api/hte/tegra194-hte driver-api/hte/tegra-hte
+driver-api/isapnp userspace-api/isapnp
+driver-api/media/drivers/v4l-drivers/zoran driver-api/media/drivers/zoran
+driver-api/mtd/intel-spi driver-api/mtd/spi-intel
+driver-api/pci driver-api/pci/pci
+driver-api/pinctl driver-api/pin-control
+driver-api/rapidio admin-guide/rapidio
+driver-api/serial/moxa-smartio driver-api/tty/moxa-smartio
+driver-api/serial/n_gsm driver-api/tty/n_gsm
+driver-api/serial/tty driver-api/tty/tty_ldisc
+driver-api/thermal/intel_powerclamp admin-guide/thermal/intel_powerclamp
+driver-api/usb driver-api/usb/usb
+driver-model/binding driver-api/driver-model/binding
+driver-model/bus driver-api/driver-model/bus
+driver-model/design-patterns driver-api/driver-model/design-patterns
+driver-model/device driver-api/driver-model/device
+driver-model/devres driver-api/driver-model/devres
+driver-model/driver driver-api/driver-model/driver
+driver-model/index driver-api/driver-model/index
+driver-model/overview driver-api/driver-model/overview
+driver-model/platform driver-api/driver-model/platform
+driver-model/porting driver-api/driver-model/porting
+early-userspace/buffer-format driver-api/early-userspace/buffer-format
+early-userspace/early_userspace_support driver-api/early-userspace/early_userspace_support
+early-userspace/index driver-api/early-userspace/index
+errseq core-api/errseq
+filesystems/binderfs admin-guide/binderfs
+filesystems/cifs/cifsd filesystems/smb/ksmbd
+filesystems/cifs/cifsroot filesystems/smb/cifsroot
+filesystems/cifs/index filesystems/smb/index
+filesystems/cifs/ksmbd filesystems/smb/ksmbd
+filesystems/ext4/ext4 admin-guide/ext4
+filesystems/ext4/ondisk/about filesystems/ext4/about
+filesystems/ext4/ondisk/allocators filesystems/ext4/allocators
+filesystems/ext4/ondisk/attributes filesystems/ext4/attributes
+filesystems/ext4/ondisk/bigalloc filesystems/ext4/bigalloc
+filesystems/ext4/ondisk/bitmaps filesystems/ext4/bitmaps
+filesystems/ext4/ondisk/blockgroup filesystems/ext4/blockgroup
+filesystems/ext4/ondisk/blockmap filesystems/ext4/blockmap
+filesystems/ext4/ondisk/blocks filesystems/ext4/blocks
+filesystems/ext4/ondisk/checksums filesystems/ext4/checksums
+filesystems/ext4/ondisk/directory filesystems/ext4/directory
+filesystems/ext4/ondisk/dynamic filesystems/ext4/dynamic
+filesystems/ext4/ondisk/eainode filesystems/ext4/eainode
+filesystems/ext4/ondisk/globals filesystems/ext4/globals
+filesystems/ext4/ondisk/group_descr filesystems/ext4/group_descr
+filesystems/ext4/ondisk/ifork filesystems/ext4/ifork
+filesystems/ext4/ondisk/inlinedata filesystems/ext4/inlinedata
+filesystems/ext4/ondisk/inodes filesystems/ext4/inodes
+filesystems/ext4/ondisk/journal filesystems/ext4/journal
+filesystems/ext4/ondisk/mmp filesystems/ext4/mmp
+filesystems/ext4/ondisk/overview filesystems/ext4/overview
+filesystems/ext4/ondisk/special_inodes filesystems/ext4/special_inodes
+filesystems/ext4/ondisk/super filesystems/ext4/super
+filesystems/sysfs-pci PCI/sysfs-pci
+filesystems/sysfs-tagging networking/sysfs-tagging
+filesystems/xfs-delayed-logging-design filesystems/xfs/xfs-delayed-logging-design
+filesystems/xfs-maintainer-entry-profile filesystems/xfs/xfs-maintainer-entry-profile
+filesystems/xfs-online-fsck-design filesystems/xfs/xfs-online-fsck-design
+filesystems/xfs-self-describing-metadata filesystems/xfs/xfs-self-describing-metadata
+gpio/index admin-guide/gpio/index
+gpio/sysfs userspace-api/gpio/sysfs
+gpu/amdgpu gpu/amdgpu/index
+hte/hte driver-api/hte/hte
+hte/index driver-api/hte/index
+hte/tegra194-hte driver-api/hte/tegra-hte
+input/alps input/devices/alps
+input/amijoy input/devices/amijoy
+input/appletouch input/devices/appletouch
+input/atarikbd input/devices/atarikbd
+input/bcm5974 input/devices/bcm5974
+input/cma3000_d0x input/devices/cma3000_d0x
+input/cs461x input/devices/cs461x
+input/edt-ft5x06 input/devices/edt-ft5x06
+input/elantech input/devices/elantech
+input/iforce-protocol input/devices/iforce-protocol
+input/joystick input/joydev/joystick
+input/joystick-api input/joydev/joystick-api
+input/joystick-parport input/devices/joystick-parport
+input/ntrig input/devices/ntrig
+input/rotary-encoder input/devices/rotary-encoder
+input/sentelic input/devices/sentelic
+input/walkera0701 input/devices/walkera0701
+input/xpad input/devices/xpad
+input/yealink input/devices/yealink
+interconnect/interconnect driver-api/interconnect
+ioctl/botching-up-ioctls process/botching-up-ioctls
+ioctl/cdrom userspace-api/ioctl/cdrom
+ioctl/hdio userspace-api/ioctl/hdio
+ioctl/index userspace-api/ioctl/index
+ioctl/ioctl-decoding userspace-api/ioctl/ioctl-decoding
+ioctl/ioctl-number userspace-api/ioctl/ioctl-number
+kbuild/namespaces core-api/symbol-namespaces
+kdump/index admin-guide/kdump/index
+kdump/kdump admin-guide/kdump/kdump
+kdump/vmcoreinfo admin-guide/kdump/vmcoreinfo
+kernel-documentation doc-guide/kernel-doc
+laptops/asus-laptop admin-guide/laptops/asus-laptop
+laptops/disk-shock-protection admin-guide/laptops/disk-shock-protection
+laptops/index admin-guide/laptops/index
+laptops/laptop-mode admin-guide/laptops/laptop-mode
+laptops/lg-laptop admin-guide/laptops/lg-laptop
+laptops/sony-laptop admin-guide/laptops/sony-laptop
+laptops/sonypi admin-guide/laptops/sonypi
+laptops/thinkpad-acpi admin-guide/laptops/thinkpad-acpi
+laptops/toshiba_haps admin-guide/laptops/toshiba_haps
+loongarch/booting arch/loongarch/booting
+loongarch/features arch/loongarch/features
+loongarch/index arch/loongarch/index
+loongarch/introduction arch/loongarch/introduction
+loongarch/irq-chip-model arch/loongarch/irq-chip-model
+m68k/buddha-driver arch/m68k/buddha-driver
+m68k/features arch/m68k/features
+m68k/index arch/m68k/index
+m68k/kernel-options arch/m68k/kernel-options
+md/index driver-api/md/index
+md/md-cluster driver-api/md/md-cluster
+md/raid5-cache driver-api/md/raid5-cache
+md/raid5-ppl driver-api/md/raid5-ppl
+media/dvb-drivers/avermedia admin-guide/media/avermedia
+media/dvb-drivers/bt8xx admin-guide/media/bt8xx
+media/dvb-drivers/ci admin-guide/media/ci
+media/dvb-drivers/contributors driver-api/media/drivers/contributors
+media/dvb-drivers/dvb-usb driver-api/media/drivers/dvb-usb
+media/dvb-drivers/faq admin-guide/media/faq
+media/dvb-drivers/frontends driver-api/media/drivers/frontends
+media/dvb-drivers/index driver-api/media/drivers/index
+media/dvb-drivers/lmedm04 admin-guide/media/lmedm04
+media/dvb-drivers/opera-firmware admin-guide/media/opera-firmware
+media/dvb-drivers/technisat admin-guide/media/technisat
+media/dvb-drivers/ttusb-dec admin-guide/media/ttusb-dec
+media/intro userspace-api/media/intro
+media/kapi/cec-core driver-api/media/cec-core
+media/kapi/dtv-ca driver-api/media/dtv-ca
+media/kapi/dtv-common driver-api/media/dtv-common
+media/kapi/dtv-core driver-api/media/dtv-core
+media/kapi/dtv-demux driver-api/media/dtv-demux
+media/kapi/dtv-frontend driver-api/media/dtv-frontend
+media/kapi/dtv-net driver-api/media/dtv-net
+media/kapi/mc-core driver-api/media/mc-core
+media/kapi/rc-core driver-api/media/rc-core
+media/kapi/v4l2-async driver-api/media/v4l2-async
+media/kapi/v4l2-common driver-api/media/v4l2-common
+media/kapi/v4l2-controls driver-api/media/v4l2-controls
+media/kapi/v4l2-core driver-api/media/v4l2-core
+media/kapi/v4l2-dev driver-api/media/v4l2-dev
+media/kapi/v4l2-device driver-api/media/v4l2-device
+media/kapi/v4l2-dv-timings driver-api/media/v4l2-dv-timings
+media/kapi/v4l2-event driver-api/media/v4l2-event
+media/kapi/v4l2-fh driver-api/media/v4l2-fh
+media/kapi/v4l2-flash-led-class driver-api/media/v4l2-flash-led-class
+media/kapi/v4l2-fwnode driver-api/media/v4l2-fwnode
+media/kapi/v4l2-intro driver-api/media/v4l2-intro
+media/kapi/v4l2-mc driver-api/media/v4l2-mc
+media/kapi/v4l2-mediabus driver-api/media/v4l2-mediabus
+media/kapi/v4l2-mem2mem driver-api/media/v4l2-mem2mem
+media/kapi/v4l2-rect driver-api/media/v4l2-rect
+media/kapi/v4l2-subdev driver-api/media/v4l2-subdev
+media/kapi/v4l2-tuner driver-api/media/v4l2-tuner
+media/kapi/v4l2-tveeprom driver-api/media/v4l2-tveeprom
+media/kapi/v4l2-videobuf2 driver-api/media/v4l2-videobuf2
+media/media_kapi driver-api/media/index
+media/media_uapi userspace-api/media/index
+media/uapi/cec/cec-api userspace-api/media/cec/cec-api
+media/uapi/cec/cec-func-close userspace-api/media/cec/cec-func-close
+media/uapi/cec/cec-func-ioctl userspace-api/media/cec/cec-func-ioctl
+media/uapi/cec/cec-func-open userspace-api/media/cec/cec-func-open
+media/uapi/cec/cec-func-poll userspace-api/media/cec/cec-func-poll
+media/uapi/cec/cec-funcs userspace-api/media/cec/cec-funcs
+media/uapi/cec/cec-header userspace-api/media/cec/cec-header
+media/uapi/cec/cec-intro userspace-api/media/cec/cec-intro
+media/uapi/cec/cec-ioc-adap-g-caps userspace-api/media/cec/cec-ioc-adap-g-caps
+media/uapi/cec/cec-ioc-adap-g-conn-info userspace-api/media/cec/cec-ioc-adap-g-conn-info
+media/uapi/cec/cec-ioc-adap-g-log-addrs userspace-api/media/cec/cec-ioc-adap-g-log-addrs
+media/uapi/cec/cec-ioc-adap-g-phys-addr userspace-api/media/cec/cec-ioc-adap-g-phys-addr
+media/uapi/cec/cec-ioc-dqevent userspace-api/media/cec/cec-ioc-dqevent
+media/uapi/cec/cec-ioc-g-mode userspace-api/media/cec/cec-ioc-g-mode
+media/uapi/cec/cec-ioc-receive userspace-api/media/cec/cec-ioc-receive
+media/uapi/cec/cec-pin-error-inj userspace-api/media/cec/cec-pin-error-inj
+media/uapi/dvb/ca userspace-api/media/dvb/ca
+media/uapi/dvb/ca-fclose userspace-api/media/dvb/ca-fclose
+media/uapi/dvb/ca-fopen userspace-api/media/dvb/ca-fopen
+media/uapi/dvb/ca-get-cap userspace-api/media/dvb/ca-get-cap
+media/uapi/dvb/ca-get-descr-info userspace-api/media/dvb/ca-get-descr-info
+media/uapi/dvb/ca-get-msg userspace-api/media/dvb/ca-get-msg
+media/uapi/dvb/ca-get-slot-info userspace-api/media/dvb/ca-get-slot-info
+media/uapi/dvb/ca-reset userspace-api/media/dvb/ca-reset
+media/uapi/dvb/ca-send-msg userspace-api/media/dvb/ca-send-msg
+media/uapi/dvb/ca-set-descr userspace-api/media/dvb/ca-set-descr
+media/uapi/dvb/ca_data_types userspace-api/media/dvb/ca_data_types
+media/uapi/dvb/ca_function_calls userspace-api/media/dvb/ca_function_calls
+media/uapi/dvb/ca_high_level userspace-api/media/dvb/ca_high_level
+media/uapi/dvb/demux userspace-api/media/dvb/demux
+media/uapi/dvb/dmx-add-pid userspace-api/media/dvb/dmx-add-pid
+media/uapi/dvb/dmx-expbuf userspace-api/media/dvb/dmx-expbuf
+media/uapi/dvb/dmx-fclose userspace-api/media/dvb/dmx-fclose
+media/uapi/dvb/dmx-fopen userspace-api/media/dvb/dmx-fopen
+media/uapi/dvb/dmx-fread userspace-api/media/dvb/dmx-fread
+media/uapi/dvb/dmx-fwrite userspace-api/media/dvb/dmx-fwrite
+media/uapi/dvb/dmx-get-pes-pids userspace-api/media/dvb/dmx-get-pes-pids
+media/uapi/dvb/dmx-get-stc userspace-api/media/dvb/dmx-get-stc
+media/uapi/dvb/dmx-mmap userspace-api/media/dvb/dmx-mmap
+media/uapi/dvb/dmx-munmap userspace-api/media/dvb/dmx-munmap
+media/uapi/dvb/dmx-qbuf userspace-api/media/dvb/dmx-qbuf
+media/uapi/dvb/dmx-querybuf userspace-api/media/dvb/dmx-querybuf
+media/uapi/dvb/dmx-remove-pid userspace-api/media/dvb/dmx-remove-pid
+media/uapi/dvb/dmx-reqbufs userspace-api/media/dvb/dmx-reqbufs
+media/uapi/dvb/dmx-set-buffer-size userspace-api/media/dvb/dmx-set-buffer-size
+media/uapi/dvb/dmx-set-filter userspace-api/media/dvb/dmx-set-filter
+media/uapi/dvb/dmx-set-pes-filter userspace-api/media/dvb/dmx-set-pes-filter
+media/uapi/dvb/dmx-start userspace-api/media/dvb/dmx-start
+media/uapi/dvb/dmx-stop userspace-api/media/dvb/dmx-stop
+media/uapi/dvb/dmx_fcalls userspace-api/media/dvb/dmx_fcalls
+media/uapi/dvb/dmx_types userspace-api/media/dvb/dmx_types
+media/uapi/dvb/dvb-fe-read-status userspace-api/media/dvb/dvb-fe-read-status
+media/uapi/dvb/dvb-frontend-event userspace-api/media/dvb/dvb-frontend-event
+media/uapi/dvb/dvb-frontend-parameters userspace-api/media/dvb/dvb-frontend-parameters
+media/uapi/dvb/dvbapi userspace-api/media/dvb/dvbapi
+media/uapi/dvb/dvbproperty userspace-api/media/dvb/dvbproperty
+media/uapi/dvb/examples userspace-api/media/dvb/examples
+media/uapi/dvb/fe-bandwidth-t userspace-api/media/dvb/fe-bandwidth-t
+media/uapi/dvb/fe-diseqc-recv-slave-reply userspace-api/media/dvb/fe-diseqc-recv-slave-reply
+media/uapi/dvb/fe-diseqc-reset-overload userspace-api/media/dvb/fe-diseqc-reset-overload
+media/uapi/dvb/fe-diseqc-send-burst userspace-api/media/dvb/fe-diseqc-send-burst
+media/uapi/dvb/fe-diseqc-send-master-cmd userspace-api/media/dvb/fe-diseqc-send-master-cmd
+media/uapi/dvb/fe-dishnetwork-send-legacy-cmd userspace-api/media/dvb/fe-dishnetwork-send-legacy-cmd
+media/uapi/dvb/fe-enable-high-lnb-voltage userspace-api/media/dvb/fe-enable-high-lnb-voltage
+media/uapi/dvb/fe-get-event userspace-api/media/dvb/fe-get-event
+media/uapi/dvb/fe-get-frontend userspace-api/media/dvb/fe-get-frontend
+media/uapi/dvb/fe-get-info userspace-api/media/dvb/fe-get-info
+media/uapi/dvb/fe-get-property userspace-api/media/dvb/fe-get-property
+media/uapi/dvb/fe-read-ber userspace-api/media/dvb/fe-read-ber
+media/uapi/dvb/fe-read-signal-strength userspace-api/media/dvb/fe-read-signal-strength
+media/uapi/dvb/fe-read-snr userspace-api/media/dvb/fe-read-snr
+media/uapi/dvb/fe-read-status userspace-api/media/dvb/fe-read-status
+media/uapi/dvb/fe-read-uncorrected-blocks userspace-api/media/dvb/fe-read-uncorrected-blocks
+media/uapi/dvb/fe-set-frontend userspace-api/media/dvb/fe-set-frontend
+media/uapi/dvb/fe-set-frontend-tune-mode userspace-api/media/dvb/fe-set-frontend-tune-mode
+media/uapi/dvb/fe-set-tone userspace-api/media/dvb/fe-set-tone
+media/uapi/dvb/fe-set-voltage userspace-api/media/dvb/fe-set-voltage
+media/uapi/dvb/fe-type-t userspace-api/media/dvb/fe-type-t
+media/uapi/dvb/fe_property_parameters userspace-api/media/dvb/fe_property_parameters
+media/uapi/dvb/frontend userspace-api/media/dvb/frontend
+media/uapi/dvb/frontend-header userspace-api/media/dvb/frontend-header
+media/uapi/dvb/frontend-property-cable-systems userspace-api/media/dvb/frontend-property-cable-systems
+media/uapi/dvb/frontend-property-satellite-systems userspace-api/media/dvb/frontend-property-satellite-systems
+media/uapi/dvb/frontend-property-terrestrial-systems userspace-api/media/dvb/frontend-property-terrestrial-systems
+media/uapi/dvb/frontend-stat-properties userspace-api/media/dvb/frontend-stat-properties
+media/uapi/dvb/frontend_f_close userspace-api/media/dvb/frontend_f_close
+media/uapi/dvb/frontend_f_open userspace-api/media/dvb/frontend_f_open
+media/uapi/dvb/frontend_fcalls userspace-api/media/dvb/frontend_fcalls
+media/uapi/dvb/frontend_legacy_api userspace-api/media/dvb/frontend_legacy_api
+media/uapi/dvb/frontend_legacy_dvbv3_api userspace-api/media/dvb/frontend_legacy_dvbv3_api
+media/uapi/dvb/headers userspace-api/media/dvb/headers
+media/uapi/dvb/intro userspace-api/media/dvb/intro
+media/uapi/dvb/legacy_dvb_apis userspace-api/media/dvb/legacy_dvb_apis
+media/uapi/dvb/net userspace-api/media/dvb/net
+media/uapi/dvb/net-add-if userspace-api/media/dvb/net-add-if
+media/uapi/dvb/net-get-if userspace-api/media/dvb/net-get-if
+media/uapi/dvb/net-remove-if userspace-api/media/dvb/net-remove-if
+media/uapi/dvb/net-types userspace-api/media/dvb/net-types
+media/uapi/dvb/query-dvb-frontend-info userspace-api/media/dvb/query-dvb-frontend-info
+media/uapi/fdl-appendix userspace-api/media/fdl-appendix
+media/uapi/gen-errors userspace-api/media/gen-errors
+media/uapi/mediactl/media-controller userspace-api/media/mediactl/media-controller
+media/uapi/mediactl/media-controller-intro userspace-api/media/mediactl/media-controller-intro
+media/uapi/mediactl/media-controller-model userspace-api/media/mediactl/media-controller-model
+media/uapi/mediactl/media-func-close userspace-api/media/mediactl/media-func-close
+media/uapi/mediactl/media-func-ioctl userspace-api/media/mediactl/media-func-ioctl
+media/uapi/mediactl/media-func-open userspace-api/media/mediactl/media-func-open
+media/uapi/mediactl/media-funcs userspace-api/media/mediactl/media-funcs
+media/uapi/mediactl/media-header userspace-api/media/mediactl/media-header
+media/uapi/mediactl/media-ioc-device-info userspace-api/media/mediactl/media-ioc-device-info
+media/uapi/mediactl/media-ioc-enum-entities userspace-api/media/mediactl/media-ioc-enum-entities
+media/uapi/mediactl/media-ioc-enum-links userspace-api/media/mediactl/media-ioc-enum-links
+media/uapi/mediactl/media-ioc-g-topology userspace-api/media/mediactl/media-ioc-g-topology
+media/uapi/mediactl/media-ioc-request-alloc userspace-api/media/mediactl/media-ioc-request-alloc
+media/uapi/mediactl/media-ioc-setup-link userspace-api/media/mediactl/media-ioc-setup-link
+media/uapi/mediactl/media-request-ioc-queue userspace-api/media/mediactl/media-request-ioc-queue
+media/uapi/mediactl/media-request-ioc-reinit userspace-api/media/mediactl/media-request-ioc-reinit
+media/uapi/mediactl/media-types userspace-api/media/mediactl/media-types
+media/uapi/mediactl/request-api userspace-api/media/mediactl/request-api
+media/uapi/mediactl/request-func-close userspace-api/media/mediactl/request-func-close
+media/uapi/mediactl/request-func-ioctl userspace-api/media/mediactl/request-func-ioctl
+media/uapi/mediactl/request-func-poll userspace-api/media/mediactl/request-func-poll
+media/uapi/rc/keytable.c userspace-api/media/rc/keytable.c
+media/uapi/rc/lirc-dev userspace-api/media/rc/lirc-dev
+media/uapi/rc/lirc-dev-intro userspace-api/media/rc/lirc-dev-intro
+media/uapi/rc/lirc-func userspace-api/media/rc/lirc-func
+media/uapi/rc/lirc-get-features userspace-api/media/rc/lirc-get-features
+media/uapi/rc/lirc-get-rec-mode userspace-api/media/rc/lirc-get-rec-mode
+media/uapi/rc/lirc-get-rec-resolution userspace-api/media/rc/lirc-get-rec-resolution
+media/uapi/rc/lirc-get-send-mode userspace-api/media/rc/lirc-get-send-mode
+media/uapi/rc/lirc-get-timeout userspace-api/media/rc/lirc-get-timeout
+media/uapi/rc/lirc-header userspace-api/media/rc/lirc-header
+media/uapi/rc/lirc-read userspace-api/media/rc/lirc-read
+media/uapi/rc/lirc-set-measure-carrier-mode userspace-api/media/rc/lirc-set-measure-carrier-mode
+media/uapi/rc/lirc-set-rec-carrier userspace-api/media/rc/lirc-set-rec-carrier
+media/uapi/rc/lirc-set-rec-carrier-range userspace-api/media/rc/lirc-set-rec-carrier-range
+media/uapi/rc/lirc-set-rec-timeout userspace-api/media/rc/lirc-set-rec-timeout
+media/uapi/rc/lirc-set-send-carrier userspace-api/media/rc/lirc-set-send-carrier
+media/uapi/rc/lirc-set-send-duty-cycle userspace-api/media/rc/lirc-set-send-duty-cycle
+media/uapi/rc/lirc-set-transmitter-mask userspace-api/media/rc/lirc-set-transmitter-mask
+media/uapi/rc/lirc-set-wideband-receiver userspace-api/media/rc/lirc-set-wideband-receiver
+media/uapi/rc/lirc-write userspace-api/media/rc/lirc-write
+media/uapi/rc/rc-intro userspace-api/media/rc/rc-intro
+media/uapi/rc/rc-protos userspace-api/media/rc/rc-protos
+media/uapi/rc/rc-sysfs-nodes userspace-api/media/rc/rc-sysfs-nodes
+media/uapi/rc/rc-table-change userspace-api/media/rc/rc-table-change
+media/uapi/rc/rc-tables userspace-api/media/rc/rc-tables
+media/uapi/rc/remote_controllers userspace-api/media/rc/remote_controllers
+media/uapi/v4l/app-pri userspace-api/media/v4l/app-pri
+media/uapi/v4l/audio userspace-api/media/v4l/audio
+media/uapi/v4l/biblio userspace-api/media/v4l/biblio
+media/uapi/v4l/buffer userspace-api/media/v4l/buffer
+media/uapi/v4l/capture-example userspace-api/media/v4l/capture-example
+media/uapi/v4l/capture.c userspace-api/media/v4l/capture.c
+media/uapi/v4l/colorspaces userspace-api/media/v4l/colorspaces
+media/uapi/v4l/colorspaces-defs userspace-api/media/v4l/colorspaces-defs
+media/uapi/v4l/colorspaces-details userspace-api/media/v4l/colorspaces-details
+media/uapi/v4l/common userspace-api/media/v4l/common
+media/uapi/v4l/common-defs userspace-api/media/v4l/common-defs
+media/uapi/v4l/compat userspace-api/media/v4l/compat
+media/uapi/v4l/control userspace-api/media/v4l/control
+media/uapi/v4l/crop userspace-api/media/v4l/crop
+media/uapi/v4l/depth-formats userspace-api/media/v4l/depth-formats
+media/uapi/v4l/dev-capture userspace-api/media/v4l/dev-capture
+media/uapi/v4l/dev-codec userspace-api/media/v4l/dev-mem2mem
+media/uapi/v4l/dev-decoder userspace-api/media/v4l/dev-decoder
+media/uapi/v4l/dev-event userspace-api/media/v4l/dev-event
+media/uapi/v4l/dev-mem2mem userspace-api/media/v4l/dev-mem2mem
+media/uapi/v4l/dev-meta userspace-api/media/v4l/dev-meta
+media/uapi/v4l/dev-osd userspace-api/media/v4l/dev-osd
+media/uapi/v4l/dev-output userspace-api/media/v4l/dev-output
+media/uapi/v4l/dev-overlay userspace-api/media/v4l/dev-overlay
+media/uapi/v4l/dev-radio userspace-api/media/v4l/dev-radio
+media/uapi/v4l/dev-raw-vbi userspace-api/media/v4l/dev-raw-vbi
+media/uapi/v4l/dev-rds userspace-api/media/v4l/dev-rds
+media/uapi/v4l/dev-sdr userspace-api/media/v4l/dev-sdr
+media/uapi/v4l/dev-sliced-vbi userspace-api/media/v4l/dev-sliced-vbi
+media/uapi/v4l/dev-stateless-decoder userspace-api/media/v4l/dev-stateless-decoder
+media/uapi/v4l/dev-subdev userspace-api/media/v4l/dev-subdev
+media/uapi/v4l/dev-touch userspace-api/media/v4l/dev-touch
+media/uapi/v4l/devices userspace-api/media/v4l/devices
+media/uapi/v4l/diff-v4l userspace-api/media/v4l/diff-v4l
+media/uapi/v4l/dmabuf userspace-api/media/v4l/dmabuf
+media/uapi/v4l/dv-timings userspace-api/media/v4l/dv-timings
+media/uapi/v4l/ext-ctrls-camera userspace-api/media/v4l/ext-ctrls-camera
+media/uapi/v4l/ext-ctrls-codec userspace-api/media/v4l/ext-ctrls-codec
+media/uapi/v4l/ext-ctrls-detect userspace-api/media/v4l/ext-ctrls-detect
+media/uapi/v4l/ext-ctrls-dv userspace-api/media/v4l/ext-ctrls-dv
+media/uapi/v4l/ext-ctrls-flash userspace-api/media/v4l/ext-ctrls-flash
+media/uapi/v4l/ext-ctrls-fm-rx userspace-api/media/v4l/ext-ctrls-fm-rx
+media/uapi/v4l/ext-ctrls-fm-tx userspace-api/media/v4l/ext-ctrls-fm-tx
+media/uapi/v4l/ext-ctrls-image-process userspace-api/media/v4l/ext-ctrls-image-process
+media/uapi/v4l/ext-ctrls-image-source userspace-api/media/v4l/ext-ctrls-image-source
+media/uapi/v4l/ext-ctrls-jpeg userspace-api/media/v4l/ext-ctrls-jpeg
+media/uapi/v4l/ext-ctrls-rf-tuner userspace-api/media/v4l/ext-ctrls-rf-tuner
+media/uapi/v4l/extended-controls userspace-api/media/v4l/extended-controls
+media/uapi/v4l/field-order userspace-api/media/v4l/field-order
+media/uapi/v4l/format userspace-api/media/v4l/format
+media/uapi/v4l/func-close userspace-api/media/v4l/func-close
+media/uapi/v4l/func-ioctl userspace-api/media/v4l/func-ioctl
+media/uapi/v4l/func-mmap userspace-api/media/v4l/func-mmap
+media/uapi/v4l/func-munmap userspace-api/media/v4l/func-munmap
+media/uapi/v4l/func-open userspace-api/media/v4l/func-open
+media/uapi/v4l/func-poll userspace-api/media/v4l/func-poll
+media/uapi/v4l/func-read userspace-api/media/v4l/func-read
+media/uapi/v4l/func-select userspace-api/media/v4l/func-select
+media/uapi/v4l/func-write userspace-api/media/v4l/func-write
+media/uapi/v4l/hist-v4l2 userspace-api/media/v4l/hist-v4l2
+media/uapi/v4l/hsv-formats userspace-api/media/v4l/hsv-formats
+media/uapi/v4l/io userspace-api/media/v4l/io
+media/uapi/v4l/libv4l userspace-api/media/v4l/libv4l
+media/uapi/v4l/libv4l-introduction userspace-api/media/v4l/libv4l-introduction
+media/uapi/v4l/meta-formats userspace-api/media/v4l/meta-formats
+media/uapi/v4l/mmap userspace-api/media/v4l/mmap
+media/uapi/v4l/open userspace-api/media/v4l/open
+media/uapi/v4l/pixfmt userspace-api/media/v4l/pixfmt
+media/uapi/v4l/pixfmt-002 userspace-api/media/v4l/pixfmt-v4l2
+media/uapi/v4l/pixfmt-003 userspace-api/media/v4l/pixfmt-v4l2-mplane
+media/uapi/v4l/pixfmt-004 userspace-api/media/v4l/pixfmt-intro
+media/uapi/v4l/pixfmt-006 userspace-api/media/v4l/colorspaces-defs
+media/uapi/v4l/pixfmt-007 userspace-api/media/v4l/colorspaces-details
+media/uapi/v4l/pixfmt-013 userspace-api/media/v4l/pixfmt-compressed
+media/uapi/v4l/pixfmt-bayer userspace-api/media/v4l/pixfmt-bayer
+media/uapi/v4l/pixfmt-cnf4 userspace-api/media/v4l/pixfmt-cnf4
+media/uapi/v4l/pixfmt-compressed userspace-api/media/v4l/pixfmt-compressed
+media/uapi/v4l/pixfmt-indexed userspace-api/media/v4l/pixfmt-indexed
+media/uapi/v4l/pixfmt-intro userspace-api/media/v4l/pixfmt-intro
+media/uapi/v4l/pixfmt-inzi userspace-api/media/v4l/pixfmt-inzi
+media/uapi/v4l/pixfmt-m420 userspace-api/media/v4l/pixfmt-m420
+media/uapi/v4l/pixfmt-meta-d4xx userspace-api/media/v4l/metafmt-d4xx
+media/uapi/v4l/pixfmt-meta-intel-ipu3 userspace-api/media/v4l/metafmt-intel-ipu3
+media/uapi/v4l/pixfmt-meta-uvc userspace-api/media/v4l/metafmt-uvc
+media/uapi/v4l/pixfmt-meta-vivid userspace-api/media/v4l/metafmt-vivid
+media/uapi/v4l/pixfmt-meta-vsp1-hgo userspace-api/media/v4l/metafmt-vsp1-hgo
+media/uapi/v4l/pixfmt-meta-vsp1-hgt userspace-api/media/v4l/metafmt-vsp1-hgt
+media/uapi/v4l/pixfmt-packed-hsv userspace-api/media/v4l/pixfmt-packed-hsv
+media/uapi/v4l/pixfmt-packed-yuv userspace-api/media/v4l/pixfmt-packed-yuv
+media/uapi/v4l/pixfmt-reserved userspace-api/media/v4l/pixfmt-reserved
+media/uapi/v4l/pixfmt-rgb userspace-api/media/v4l/pixfmt-rgb
+media/uapi/v4l/pixfmt-sbggr16 userspace-api/media/v4l/pixfmt-srggb16
+media/uapi/v4l/pixfmt-sdr-cs08 userspace-api/media/v4l/pixfmt-sdr-cs08
+media/uapi/v4l/pixfmt-sdr-cs14le userspace-api/media/v4l/pixfmt-sdr-cs14le
+media/uapi/v4l/pixfmt-sdr-cu08 userspace-api/media/v4l/pixfmt-sdr-cu08
+media/uapi/v4l/pixfmt-sdr-cu16le userspace-api/media/v4l/pixfmt-sdr-cu16le
+media/uapi/v4l/pixfmt-sdr-pcu16be userspace-api/media/v4l/pixfmt-sdr-pcu16be
+media/uapi/v4l/pixfmt-sdr-pcu18be userspace-api/media/v4l/pixfmt-sdr-pcu18be
+media/uapi/v4l/pixfmt-sdr-pcu20be userspace-api/media/v4l/pixfmt-sdr-pcu20be
+media/uapi/v4l/pixfmt-sdr-ru12le userspace-api/media/v4l/pixfmt-sdr-ru12le
+media/uapi/v4l/pixfmt-srggb10 userspace-api/media/v4l/pixfmt-srggb10
+media/uapi/v4l/pixfmt-srggb10-ipu3 userspace-api/media/v4l/pixfmt-srggb10-ipu3
+media/uapi/v4l/pixfmt-srggb10alaw8 userspace-api/media/v4l/pixfmt-srggb10alaw8
+media/uapi/v4l/pixfmt-srggb10dpcm8 userspace-api/media/v4l/pixfmt-srggb10dpcm8
+media/uapi/v4l/pixfmt-srggb10p userspace-api/media/v4l/pixfmt-srggb10p
+media/uapi/v4l/pixfmt-srggb12 userspace-api/media/v4l/pixfmt-srggb12
+media/uapi/v4l/pixfmt-srggb12p userspace-api/media/v4l/pixfmt-srggb12p
+media/uapi/v4l/pixfmt-srggb14 userspace-api/media/v4l/pixfmt-srggb14
+media/uapi/v4l/pixfmt-srggb14p userspace-api/media/v4l/pixfmt-srggb14p
+media/uapi/v4l/pixfmt-srggb16 userspace-api/media/v4l/pixfmt-srggb16
+media/uapi/v4l/pixfmt-srggb8 userspace-api/media/v4l/pixfmt-srggb8
+media/uapi/v4l/pixfmt-tch-td08 userspace-api/media/v4l/pixfmt-tch-td08
+media/uapi/v4l/pixfmt-tch-td16 userspace-api/media/v4l/pixfmt-tch-td16
+media/uapi/v4l/pixfmt-tch-tu08 userspace-api/media/v4l/pixfmt-tch-tu08
+media/uapi/v4l/pixfmt-tch-tu16 userspace-api/media/v4l/pixfmt-tch-tu16
+media/uapi/v4l/pixfmt-uv8 userspace-api/media/v4l/pixfmt-uv8
+media/uapi/v4l/pixfmt-v4l2 userspace-api/media/v4l/pixfmt-v4l2
+media/uapi/v4l/pixfmt-v4l2-mplane userspace-api/media/v4l/pixfmt-v4l2-mplane
+media/uapi/v4l/pixfmt-y12i userspace-api/media/v4l/pixfmt-y12i
+media/uapi/v4l/pixfmt-y8i userspace-api/media/v4l/pixfmt-y8i
+media/uapi/v4l/pixfmt-z16 userspace-api/media/v4l/pixfmt-z16
+media/uapi/v4l/planar-apis userspace-api/media/v4l/planar-apis
+media/uapi/v4l/querycap userspace-api/media/v4l/querycap
+media/uapi/v4l/rw userspace-api/media/v4l/rw
+media/uapi/v4l/sdr-formats userspace-api/media/v4l/sdr-formats
+media/uapi/v4l/selection-api userspace-api/media/v4l/selection-api
+media/uapi/v4l/selection-api-002 userspace-api/media/v4l/selection-api-intro
+media/uapi/v4l/selection-api-003 userspace-api/media/v4l/selection-api-targets
+media/uapi/v4l/selection-api-004 userspace-api/media/v4l/selection-api-configuration
+media/uapi/v4l/selection-api-005 userspace-api/media/v4l/selection-api-vs-crop-api
+media/uapi/v4l/selection-api-006 userspace-api/media/v4l/selection-api-examples
+media/uapi/v4l/selection-api-configuration userspace-api/media/v4l/selection-api-configuration
+media/uapi/v4l/selection-api-examples userspace-api/media/v4l/selection-api-examples
+media/uapi/v4l/selection-api-intro userspace-api/media/v4l/selection-api-intro
+media/uapi/v4l/selection-api-targets userspace-api/media/v4l/selection-api-targets
+media/uapi/v4l/selection-api-vs-crop-api userspace-api/media/v4l/selection-api-vs-crop-api
+media/uapi/v4l/selections-common userspace-api/media/v4l/selections-common
+media/uapi/v4l/standard userspace-api/media/v4l/standard
+media/uapi/v4l/streaming-par userspace-api/media/v4l/streaming-par
+media/uapi/v4l/subdev-formats userspace-api/media/v4l/subdev-formats
+media/uapi/v4l/tch-formats userspace-api/media/v4l/tch-formats
+media/uapi/v4l/tuner userspace-api/media/v4l/tuner
+media/uapi/v4l/user-func userspace-api/media/v4l/user-func
+media/uapi/v4l/userp userspace-api/media/v4l/userp
+media/uapi/v4l/v4l2 userspace-api/media/v4l/v4l2
+media/uapi/v4l/v4l2-selection-flags userspace-api/media/v4l/v4l2-selection-flags
+media/uapi/v4l/v4l2-selection-targets userspace-api/media/v4l/v4l2-selection-targets
+media/uapi/v4l/v4l2grab-example userspace-api/media/v4l/v4l2grab-example
+media/uapi/v4l/v4l2grab.c userspace-api/media/v4l/v4l2grab.c
+media/uapi/v4l/video userspace-api/media/v4l/video
+media/uapi/v4l/videodev userspace-api/media/v4l/videodev
+media/uapi/v4l/vidioc-create-bufs userspace-api/media/v4l/vidioc-create-bufs
+media/uapi/v4l/vidioc-cropcap userspace-api/media/v4l/vidioc-cropcap
+media/uapi/v4l/vidioc-dbg-g-chip-info userspace-api/media/v4l/vidioc-dbg-g-chip-info
+media/uapi/v4l/vidioc-dbg-g-register userspace-api/media/v4l/vidioc-dbg-g-register
+media/uapi/v4l/vidioc-decoder-cmd userspace-api/media/v4l/vidioc-decoder-cmd
+media/uapi/v4l/vidioc-dqevent userspace-api/media/v4l/vidioc-dqevent
+media/uapi/v4l/vidioc-dv-timings-cap userspace-api/media/v4l/vidioc-dv-timings-cap
+media/uapi/v4l/vidioc-encoder-cmd userspace-api/media/v4l/vidioc-encoder-cmd
+media/uapi/v4l/vidioc-enum-dv-timings userspace-api/media/v4l/vidioc-enum-dv-timings
+media/uapi/v4l/vidioc-enum-fmt userspace-api/media/v4l/vidioc-enum-fmt
+media/uapi/v4l/vidioc-enum-frameintervals userspace-api/media/v4l/vidioc-enum-frameintervals
+media/uapi/v4l/vidioc-enum-framesizes userspace-api/media/v4l/vidioc-enum-framesizes
+media/uapi/v4l/vidioc-enum-freq-bands userspace-api/media/v4l/vidioc-enum-freq-bands
+media/uapi/v4l/vidioc-enumaudio userspace-api/media/v4l/vidioc-enumaudio
+media/uapi/v4l/vidioc-enumaudioout userspace-api/media/v4l/vidioc-enumaudioout
+media/uapi/v4l/vidioc-enuminput userspace-api/media/v4l/vidioc-enuminput
+media/uapi/v4l/vidioc-enumoutput userspace-api/media/v4l/vidioc-enumoutput
+media/uapi/v4l/vidioc-enumstd userspace-api/media/v4l/vidioc-enumstd
+media/uapi/v4l/vidioc-expbuf userspace-api/media/v4l/vidioc-expbuf
+media/uapi/v4l/vidioc-g-audio userspace-api/media/v4l/vidioc-g-audio
+media/uapi/v4l/vidioc-g-audioout userspace-api/media/v4l/vidioc-g-audioout
+media/uapi/v4l/vidioc-g-crop userspace-api/media/v4l/vidioc-g-crop
+media/uapi/v4l/vidioc-g-ctrl userspace-api/media/v4l/vidioc-g-ctrl
+media/uapi/v4l/vidioc-g-dv-timings userspace-api/media/v4l/vidioc-g-dv-timings
+media/uapi/v4l/vidioc-g-edid userspace-api/media/v4l/vidioc-g-edid
+media/uapi/v4l/vidioc-g-enc-index userspace-api/media/v4l/vidioc-g-enc-index
+media/uapi/v4l/vidioc-g-ext-ctrls userspace-api/media/v4l/vidioc-g-ext-ctrls
+media/uapi/v4l/vidioc-g-fbuf userspace-api/media/v4l/vidioc-g-fbuf
+media/uapi/v4l/vidioc-g-fmt userspace-api/media/v4l/vidioc-g-fmt
+media/uapi/v4l/vidioc-g-frequency userspace-api/media/v4l/vidioc-g-frequency
+media/uapi/v4l/vidioc-g-input userspace-api/media/v4l/vidioc-g-input
+media/uapi/v4l/vidioc-g-jpegcomp userspace-api/media/v4l/vidioc-g-jpegcomp
+media/uapi/v4l/vidioc-g-modulator userspace-api/media/v4l/vidioc-g-modulator
+media/uapi/v4l/vidioc-g-output userspace-api/media/v4l/vidioc-g-output
+media/uapi/v4l/vidioc-g-parm userspace-api/media/v4l/vidioc-g-parm
+media/uapi/v4l/vidioc-g-priority userspace-api/media/v4l/vidioc-g-priority
+media/uapi/v4l/vidioc-g-selection userspace-api/media/v4l/vidioc-g-selection
+media/uapi/v4l/vidioc-g-sliced-vbi-cap userspace-api/media/v4l/vidioc-g-sliced-vbi-cap
+media/uapi/v4l/vidioc-g-std userspace-api/media/v4l/vidioc-g-std
+media/uapi/v4l/vidioc-g-tuner userspace-api/media/v4l/vidioc-g-tuner
+media/uapi/v4l/vidioc-log-status userspace-api/media/v4l/vidioc-log-status
+media/uapi/v4l/vidioc-overlay userspace-api/media/v4l/vidioc-overlay
+media/uapi/v4l/vidioc-prepare-buf userspace-api/media/v4l/vidioc-prepare-buf
+media/uapi/v4l/vidioc-qbuf userspace-api/media/v4l/vidioc-qbuf
+media/uapi/v4l/vidioc-query-dv-timings userspace-api/media/v4l/vidioc-query-dv-timings
+media/uapi/v4l/vidioc-querybuf userspace-api/media/v4l/vidioc-querybuf
+media/uapi/v4l/vidioc-querycap userspace-api/media/v4l/vidioc-querycap
+media/uapi/v4l/vidioc-queryctrl userspace-api/media/v4l/vidioc-queryctrl
+media/uapi/v4l/vidioc-querystd userspace-api/media/v4l/vidioc-querystd
+media/uapi/v4l/vidioc-reqbufs userspace-api/media/v4l/vidioc-reqbufs
+media/uapi/v4l/vidioc-s-hw-freq-seek userspace-api/media/v4l/vidioc-s-hw-freq-seek
+media/uapi/v4l/vidioc-streamon userspace-api/media/v4l/vidioc-streamon
+media/uapi/v4l/vidioc-subdev-enum-frame-interval userspace-api/media/v4l/vidioc-subdev-enum-frame-interval
+media/uapi/v4l/vidioc-subdev-enum-frame-size userspace-api/media/v4l/vidioc-subdev-enum-frame-size
+media/uapi/v4l/vidioc-subdev-enum-mbus-code userspace-api/media/v4l/vidioc-subdev-enum-mbus-code
+media/uapi/v4l/vidioc-subdev-g-crop userspace-api/media/v4l/vidioc-subdev-g-crop
+media/uapi/v4l/vidioc-subdev-g-fmt userspace-api/media/v4l/vidioc-subdev-g-fmt
+media/uapi/v4l/vidioc-subdev-g-frame-interval userspace-api/media/v4l/vidioc-subdev-g-frame-interval
+media/uapi/v4l/vidioc-subdev-g-selection userspace-api/media/v4l/vidioc-subdev-g-selection
+media/uapi/v4l/vidioc-subscribe-event userspace-api/media/v4l/vidioc-subscribe-event
+media/uapi/v4l/yuv-formats userspace-api/media/v4l/yuv-formats
+media/v4l-drivers/au0828-cardlist admin-guide/media/au0828-cardlist
+media/v4l-drivers/bttv admin-guide/media/bttv
+media/v4l-drivers/bttv-cardlist admin-guide/media/bttv-cardlist
+media/v4l-drivers/bttv-devel driver-api/media/drivers/bttv-devel
+media/v4l-drivers/cafe_ccic admin-guide/media/cafe_ccic
+media/v4l-drivers/cardlist admin-guide/media/cardlist
+media/v4l-drivers/cx2341x driver-api/media/drivers/cx2341x-devel
+media/v4l-drivers/cx2341x-devel driver-api/media/drivers/cx2341x-devel
+media/v4l-drivers/cx2341x-uapi userspace-api/media/drivers/cx2341x-uapi
+media/v4l-drivers/cx23885-cardlist admin-guide/media/cx23885-cardlist
+media/v4l-drivers/cx88 admin-guide/media/cx88
+media/v4l-drivers/cx88-cardlist admin-guide/media/cx88-cardlist
+media/v4l-drivers/cx88-devel driver-api/media/drivers/cx88-devel
+media/v4l-drivers/em28xx-cardlist admin-guide/media/em28xx-cardlist
+media/v4l-drivers/fimc admin-guide/media/fimc
+media/v4l-drivers/fimc-devel driver-api/media/drivers/fimc-devel
+media/v4l-drivers/fourcc userspace-api/media/v4l/fourcc
+media/v4l-drivers/gspca-cardlist admin-guide/media/gspca-cardlist
+media/v4l-drivers/imx admin-guide/media/imx
+media/v4l-drivers/imx-uapi userspace-api/media/drivers/imx-uapi
+media/v4l-drivers/imx7 admin-guide/media/imx7
+media/v4l-drivers/index userspace-api/media/drivers/index
+media/v4l-drivers/ipu3 admin-guide/media/ipu3
+media/v4l-drivers/ivtv admin-guide/media/ivtv
+media/v4l-drivers/ivtv-cardlist admin-guide/media/ivtv-cardlist
+media/v4l-drivers/max2175 userspace-api/media/drivers/max2175
+media/v4l-drivers/omap3isp admin-guide/media/omap3isp
+media/v4l-drivers/omap3isp-uapi userspace-api/media/drivers/omap3isp-uapi
+media/v4l-drivers/philips admin-guide/media/philips
+media/v4l-drivers/pvrusb2 driver-api/media/drivers/pvrusb2
+media/v4l-drivers/pxa_camera driver-api/media/drivers/pxa_camera
+media/v4l-drivers/qcom_camss admin-guide/media/qcom_camss
+media/v4l-drivers/radiotrack driver-api/media/drivers/radiotrack
+media/v4l-drivers/rcar-fdp1 admin-guide/media/rcar-fdp1
+media/v4l-drivers/saa7134 admin-guide/media/saa7134
+media/v4l-drivers/saa7134-cardlist admin-guide/media/saa7134-cardlist
+media/v4l-drivers/saa7134-devel driver-api/media/drivers/saa7134-devel
+media/v4l-drivers/saa7164-cardlist admin-guide/media/saa7164-cardlist
+media/v4l-drivers/sh_mobile_ceu_camera driver-api/media/drivers/sh_mobile_ceu_camera
+media/v4l-drivers/si470x admin-guide/media/si470x
+media/v4l-drivers/si4713 admin-guide/media/si4713
+media/v4l-drivers/si476x admin-guide/media/si476x
+media/v4l-drivers/tuner-cardlist admin-guide/media/tuner-cardlist
+media/v4l-drivers/tuners driver-api/media/drivers/tuners
+media/v4l-drivers/uvcvideo userspace-api/media/drivers/uvcvideo
+media/v4l-drivers/v4l-with-ir admin-guide/media/remote-controller
+media/v4l-drivers/vimc admin-guide/media/vimc
+media/v4l-drivers/vimc-devel driver-api/media/drivers/vimc-devel
+media/v4l-drivers/vivid admin-guide/media/vivid
+media/v4l-drivers/zoran driver-api/media/drivers/zoran
+memory-devices/ti-emif driver-api/memory-devices/ti-emif
+mips/booting arch/mips/booting
+mips/features arch/mips/features
+mips/index arch/mips/index
+mips/ingenic-tcu arch/mips/ingenic-tcu
+mm/slub admin-guide/mm/slab
+mmc/index driver-api/mmc/index
+mmc/mmc-async-req driver-api/mmc/mmc-async-req
+mmc/mmc-dev-attrs driver-api/mmc/mmc-dev-attrs
+mmc/mmc-dev-parts driver-api/mmc/mmc-dev-parts
+mmc/mmc-tools driver-api/mmc/mmc-tools
+mtd/index driver-api/mtd/index
+mtd/intel-spi driver-api/mtd/spi-intel
+mtd/nand_ecc driver-api/mtd/nand_ecc
+mtd/spi-nor driver-api/mtd/spi-nor
+namespaces/compatibility-list admin-guide/namespaces/compatibility-list
+namespaces/index admin-guide/namespaces/index
+namespaces/resource-control admin-guide/namespaces/resource-control
+networking/altera_tse networking/device_drivers/ethernet/altera/altera_tse
+networking/baycom networking/device_drivers/hamradio/baycom
+networking/bpf_flow_dissector bpf/prog_flow_dissector
+networking/cxacru networking/device_drivers/atm/cxacru
+networking/defza networking/device_drivers/fddi/defza
+networking/device_drivers/3com/3c509 networking/device_drivers/ethernet/3com/3c509
+networking/device_drivers/3com/vortex networking/device_drivers/ethernet/3com/vortex
+networking/device_drivers/amazon/ena networking/device_drivers/ethernet/amazon/ena
+networking/device_drivers/aquantia/atlantic networking/device_drivers/ethernet/aquantia/atlantic
+networking/device_drivers/chelsio/cxgb networking/device_drivers/ethernet/chelsio/cxgb
+networking/device_drivers/cirrus/cs89x0 networking/device_drivers/ethernet/cirrus/cs89x0
+networking/device_drivers/davicom/dm9000 networking/device_drivers/ethernet/davicom/dm9000
+networking/device_drivers/dec/dmfe networking/device_drivers/ethernet/dec/dmfe
+networking/device_drivers/dlink/dl2k networking/device_drivers/ethernet/dlink/dl2k
+networking/device_drivers/freescale/dpaa networking/device_drivers/ethernet/freescale/dpaa
+networking/device_drivers/freescale/dpaa2/dpio-driver networking/device_drivers/ethernet/freescale/dpaa2/dpio-driver
+networking/device_drivers/freescale/dpaa2/ethernet-driver networking/device_drivers/ethernet/freescale/dpaa2/ethernet-driver
+networking/device_drivers/freescale/dpaa2/index networking/device_drivers/ethernet/freescale/dpaa2/index
+networking/device_drivers/freescale/dpaa2/mac-phy-support networking/device_drivers/ethernet/freescale/dpaa2/mac-phy-support
+networking/device_drivers/freescale/dpaa2/overview networking/device_drivers/ethernet/freescale/dpaa2/overview
+networking/device_drivers/freescale/gianfar networking/device_drivers/ethernet/freescale/gianfar
+networking/device_drivers/google/gve networking/device_drivers/ethernet/google/gve
+networking/device_drivers/intel/e100 networking/device_drivers/ethernet/intel/e100
+networking/device_drivers/intel/e1000 networking/device_drivers/ethernet/intel/e1000
+networking/device_drivers/intel/e1000e networking/device_drivers/ethernet/intel/e1000e
+networking/device_drivers/intel/fm10k networking/device_drivers/ethernet/intel/fm10k
+networking/device_drivers/intel/i40e networking/device_drivers/ethernet/intel/i40e
+networking/device_drivers/intel/iavf networking/device_drivers/ethernet/intel/iavf
+networking/device_drivers/intel/ice networking/device_drivers/ethernet/intel/ice
+networking/device_drivers/intel/igb networking/device_drivers/ethernet/intel/igb
+networking/device_drivers/intel/igbvf networking/device_drivers/ethernet/intel/igbvf
+networking/device_drivers/intel/ipw2100 networking/device_drivers/wifi/intel/ipw2100
+networking/device_drivers/intel/ipw2200 networking/device_drivers/wifi/intel/ipw2200
+networking/device_drivers/intel/ixgbe networking/device_drivers/ethernet/intel/ixgbe
+networking/device_drivers/intel/ixgbevf networking/device_drivers/ethernet/intel/ixgbevf
+networking/device_drivers/marvell/octeontx2 networking/device_drivers/ethernet/marvell/octeontx2
+networking/device_drivers/microsoft/netvsc networking/device_drivers/ethernet/microsoft/netvsc
+networking/device_drivers/neterion/s2io networking/device_drivers/ethernet/neterion/s2io
+networking/device_drivers/netronome/nfp networking/device_drivers/ethernet/netronome/nfp
+networking/device_drivers/pensando/ionic networking/device_drivers/ethernet/pensando/ionic
+networking/device_drivers/qualcomm/rmnet networking/device_drivers/cellular/qualcomm/rmnet
+networking/device_drivers/smsc/smc9 networking/device_drivers/ethernet/smsc/smc9
+networking/device_drivers/stmicro/stmmac networking/device_drivers/ethernet/stmicro/stmmac
+networking/device_drivers/ti/cpsw networking/device_drivers/ethernet/ti/cpsw
+networking/device_drivers/ti/cpsw_switchdev networking/device_drivers/ethernet/ti/cpsw_switchdev
+networking/device_drivers/ti/tlan networking/device_drivers/ethernet/ti/tlan
+networking/devlink-trap networking/devlink/devlink-trap
+networking/dpaa2/dpio-driver networking/device_drivers/ethernet/freescale/dpaa2/dpio-driver
+networking/dpaa2/ethernet-driver networking/device_drivers/ethernet/freescale/dpaa2/ethernet-driver
+networking/dpaa2/index networking/device_drivers/ethernet/freescale/dpaa2/index
+networking/dpaa2/overview networking/device_drivers/ethernet/freescale/dpaa2/overview
+networking/e100 networking/device_drivers/ethernet/intel/e100
+networking/e1000 networking/device_drivers/ethernet/intel/e1000
+networking/e1000e networking/device_drivers/ethernet/intel/e1000e
+networking/fm10k networking/device_drivers/ethernet/intel/fm10k
+networking/fore200e networking/device_drivers/atm/fore200e
+networking/hinic networking/device_drivers/ethernet/huawei/hinic
+networking/i40e networking/device_drivers/ethernet/intel/i40e
+networking/iavf networking/device_drivers/ethernet/intel/iavf
+networking/ice networking/device_drivers/ethernet/intel/ice
+networking/igb networking/device_drivers/ethernet/intel/igb
+networking/igbvf networking/device_drivers/ethernet/intel/igbvf
+networking/iphase networking/device_drivers/atm/iphase
+networking/ixgbe networking/device_drivers/ethernet/intel/ixgbe
+networking/ixgbevf networking/device_drivers/ethernet/intel/ixgbevf
+networking/netdev-FAQ process/maintainer-netdev
+networking/skfp networking/device_drivers/fddi/skfp
+networking/z8530drv networking/device_drivers/hamradio/z8530drv
+nfc/index driver-api/nfc/index
+nfc/nfc-hci driver-api/nfc/nfc-hci
+nfc/nfc-pn544 driver-api/nfc/nfc-pn544
+nios2/features arch/nios2/features
+nios2/index arch/nios2/index
+nios2/nios2 arch/nios2/nios2
+nvdimm/btt driver-api/nvdimm/btt
+nvdimm/index driver-api/nvdimm/index
+nvdimm/nvdimm driver-api/nvdimm/nvdimm
+nvdimm/security driver-api/nvdimm/security
+nvmem/nvmem driver-api/nvmem
+openrisc/features arch/openrisc/features
+openrisc/index arch/openrisc/index
+openrisc/openrisc_port arch/openrisc/openrisc_port
+openrisc/todo arch/openrisc/todo
+parisc/debugging arch/parisc/debugging
+parisc/features arch/parisc/features
+parisc/index arch/parisc/index
+parisc/registers arch/parisc/registers
+perf/arm-ccn admin-guide/perf/arm-ccn
+perf/arm_dsu_pmu admin-guide/perf/arm_dsu_pmu
+perf/hisi-pmu admin-guide/perf/hisi-pmu
+perf/index admin-guide/perf/index
+perf/qcom_l2_pmu admin-guide/perf/qcom_l2_pmu
+perf/qcom_l3_pmu admin-guide/perf/qcom_l3_pmu
+perf/thunderx2-pmu admin-guide/perf/thunderx2-pmu
+perf/xgene-pmu admin-guide/perf/xgene-pmu
+phy/samsung-usb2 driver-api/phy/samsung-usb2
+powerpc/associativity arch/powerpc/associativity
+powerpc/booting arch/powerpc/booting
+powerpc/bootwrapper arch/powerpc/bootwrapper
+powerpc/cpu_families arch/powerpc/cpu_families
+powerpc/cpu_features arch/powerpc/cpu_features
+powerpc/dawr-power9 arch/powerpc/dawr-power9
+powerpc/dexcr arch/powerpc/dexcr
+powerpc/dscr arch/powerpc/dscr
+powerpc/eeh-pci-error-recovery arch/powerpc/eeh-pci-error-recovery
+powerpc/elf_hwcaps arch/powerpc/elf_hwcaps
+powerpc/elfnote arch/powerpc/elfnote
+powerpc/features arch/powerpc/features
+powerpc/firmware-assisted-dump arch/powerpc/firmware-assisted-dump
+powerpc/hvcs arch/powerpc/hvcs
+powerpc/imc arch/powerpc/imc
+powerpc/index arch/powerpc/index
+powerpc/isa-versions arch/powerpc/isa-versions
+powerpc/kaslr-booke32 arch/powerpc/kaslr-booke32
+powerpc/mpc52xx arch/powerpc/mpc52xx
+powerpc/papr_hcalls arch/powerpc/papr_hcalls
+powerpc/pci_iov_resource_on_powernv arch/powerpc/pci_iov_resource_on_powernv
+powerpc/pmu-ebb arch/powerpc/pmu-ebb
+powerpc/ptrace arch/powerpc/ptrace
+powerpc/qe_firmware arch/powerpc/qe_firmware
+powerpc/syscall64-abi arch/powerpc/syscall64-abi
+powerpc/transactional_memory arch/powerpc/transactional_memory
+powerpc/ultravisor arch/powerpc/ultravisor
+powerpc/vas-api arch/powerpc/vas-api
+powerpc/vcpudispatch_stats arch/powerpc/vcpudispatch_stats
+powerpc/vmemmap_dedup arch/powerpc/vmemmap_dedup
+process/clang-format dev-tools/clang-format
+process/magic-number staging/magic-number
+process/unaligned-memory-access core-api/unaligned-memory-access
+rapidio/index driver-api/rapidio/index
+rapidio/mport_cdev driver-api/rapidio/mport_cdev
+rapidio/rapidio driver-api/rapidio/rapidio
+rapidio/rio_cm driver-api/rapidio/rio_cm
+rapidio/sysfs driver-api/rapidio/sysfs
+rapidio/tsi721 driver-api/rapidio/tsi721
+riscv/acpi arch/riscv/acpi
+riscv/boot arch/riscv/boot
+riscv/boot-image-header arch/riscv/boot-image-header
+riscv/features arch/riscv/features
+riscv/hwprobe arch/riscv/hwprobe
+riscv/index arch/riscv/index
+riscv/patch-acceptance arch/riscv/patch-acceptance
+riscv/uabi arch/riscv/uabi
+riscv/vector arch/riscv/vector
+riscv/vm-layout arch/riscv/vm-layout
+s390/3270 arch/s390/3270
+s390/cds arch/s390/cds
+s390/common_io arch/s390/common_io
+s390/driver-model arch/s390/driver-model
+s390/features arch/s390/features
+s390/index arch/s390/index
+s390/monreader arch/s390/monreader
+s390/pci arch/s390/pci
+s390/qeth arch/s390/qeth
+s390/s390dbf arch/s390/s390dbf
+s390/text_files arch/s390/text_files
+s390/vfio-ap arch/s390/vfio-ap
+s390/vfio-ap-locking arch/s390/vfio-ap-locking
+s390/vfio-ccw arch/s390/vfio-ccw
+s390/zfcpdump arch/s390/zfcpdump
+security/LSM security/lsm-development
+security/LSM-sctp security/SCTP
+serial/driver driver-api/serial/driver
+serial/index driver-api/serial/index
+serial/moxa-smartio driver-api/tty/moxa-smartio
+serial/n_gsm driver-api/tty/n_gsm
+serial/serial-iso7816 driver-api/serial/serial-iso7816
+serial/serial-rs485 driver-api/serial/serial-rs485
+serial/tty driver-api/tty/tty_ldisc
+sh/booting arch/sh/booting
+sh/features arch/sh/features
+sh/index arch/sh/index
+sh/new-machine arch/sh/new-machine
+sh/register-banks arch/sh/register-banks
+sparc/adi arch/sparc/adi
+sparc/console arch/sparc/console
+sparc/features arch/sparc/features
+sparc/index arch/sparc/index
+sparc/oradax/oracle-dax arch/sparc/oradax/oracle-dax
+staging/kprobes trace/kprobes
+sysctl/abi admin-guide/sysctl/abi
+sysctl/fs admin-guide/sysctl/fs
+sysctl/index admin-guide/sysctl/index
+sysctl/kernel admin-guide/sysctl/kernel
+sysctl/net admin-guide/sysctl/net
+sysctl/sunrpc admin-guide/sysctl/sunrpc
+sysctl/user admin-guide/sysctl/user
+sysctl/vm admin-guide/sysctl/vm
+thermal/cpu-cooling-api driver-api/thermal/cpu-cooling-api
+thermal/exynos_thermal driver-api/thermal/exynos_thermal
+thermal/exynos_thermal_emulation driver-api/thermal/exynos_thermal_emulation
+thermal/index driver-api/thermal/index
+thermal/intel_powerclamp admin-guide/thermal/intel_powerclamp
+thermal/nouveau_thermal driver-api/thermal/nouveau_thermal
+thermal/power_allocator driver-api/thermal/power_allocator
+thermal/sysfs-api driver-api/thermal/sysfs-api
+thermal/x86_pkg_temperature_thermal driver-api/thermal/x86_pkg_temperature_thermal
+tpm/index security/tpm/index
+tpm/tpm_vtpm_proxy security/tpm/tpm_vtpm_proxy
+trace/coresight trace/coresight/coresight
+trace/coresight-cpu-debug trace/coresight/coresight-cpu-debug
+trace/rv/da_monitor_synthesis trace/rv/monitor_synthesis
+translations/it_IT/admin-guide/security-bugs translations/it_IT/process/security-bugs
+translations/it_IT/process/clang-format translations/it_IT/dev-tools/clang-format
+translations/it_IT/process/magic-number translations/it_IT/staging/magic-number
+translations/it_IT/riscv/patch-acceptance translations/it_IT/arch/riscv/patch-acceptance
+translations/ja_JP/howto translations/ja_JP/process/howto
+translations/ko_KR/howto translations/ko_KR/process/howto
+translations/sp_SP/howto translations/sp_SP/process/howto
+translations/sp_SP/submitting-patches translations/sp_SP/process/submitting-patches
+translations/zh_CN/admin-guide/security-bugs translations/zh_CN/process/security-bugs
+translations/zh_CN/arch translations/zh_CN/arch/index
+translations/zh_CN/arm64/amu translations/zh_CN/arch/arm64/amu
+translations/zh_CN/arm64/elf_hwcaps translations/zh_CN/arch/arm64/elf_hwcaps
+translations/zh_CN/arm64/hugetlbpage translations/zh_CN/arch/arm64/hugetlbpage
+translations/zh_CN/arm64/index translations/zh_CN/arch/arm64/index
+translations/zh_CN/arm64/perf translations/zh_CN/arch/arm64/perf
+translations/zh_CN/coding-style translations/zh_CN/process/coding-style
+translations/zh_CN/loongarch/booting translations/zh_CN/arch/loongarch/booting
+translations/zh_CN/loongarch/features translations/zh_CN/arch/loongarch/features
+translations/zh_CN/loongarch/index translations/zh_CN/arch/loongarch/index
+translations/zh_CN/loongarch/introduction translations/zh_CN/arch/loongarch/introduction
+translations/zh_CN/loongarch/irq-chip-model translations/zh_CN/arch/loongarch/irq-chip-model
+translations/zh_CN/mips/booting translations/zh_CN/arch/mips/booting
+translations/zh_CN/mips/features translations/zh_CN/arch/mips/features
+translations/zh_CN/mips/index translations/zh_CN/arch/mips/index
+translations/zh_CN/mips/ingenic-tcu translations/zh_CN/arch/mips/ingenic-tcu
+translations/zh_CN/openrisc/index translations/zh_CN/arch/openrisc/index
+translations/zh_CN/openrisc/openrisc_port translations/zh_CN/arch/openrisc/openrisc_port
+translations/zh_CN/openrisc/todo translations/zh_CN/arch/openrisc/todo
+translations/zh_CN/parisc/debugging translations/zh_CN/arch/parisc/debugging
+translations/zh_CN/parisc/index translations/zh_CN/arch/parisc/index
+translations/zh_CN/parisc/registers translations/zh_CN/arch/parisc/registers
+translations/zh_CN/riscv/boot-image-header translations/zh_CN/arch/riscv/boot-image-header
+translations/zh_CN/riscv/index translations/zh_CN/arch/riscv/index
+translations/zh_CN/riscv/patch-acceptance translations/zh_CN/arch/riscv/patch-acceptance
+translations/zh_CN/riscv/vm-layout translations/zh_CN/arch/riscv/vm-layout
+translations/zh_CN/vm/active_mm translations/zh_CN/mm/active_mm
+translations/zh_CN/vm/balance translations/zh_CN/mm/balance
+translations/zh_CN/vm/damon/api translations/zh_CN/mm/damon/api
+translations/zh_CN/vm/damon/design translations/zh_CN/mm/damon/design
+translations/zh_CN/vm/damon/faq translations/zh_CN/mm/damon/faq
+translations/zh_CN/vm/damon/index translations/zh_CN/mm/damon/index
+translations/zh_CN/vm/free_page_reporting translations/zh_CN/mm/free_page_reporting
+translations/zh_CN/vm/highmem translations/zh_CN/mm/highmem
+translations/zh_CN/vm/hmm translations/zh_CN/mm/hmm
+translations/zh_CN/vm/hugetlbfs_reserv translations/zh_CN/mm/hugetlbfs_reserv
+translations/zh_CN/vm/hwpoison translations/zh_CN/mm/hwpoison
+translations/zh_CN/vm/index translations/zh_CN/mm/index
+translations/zh_CN/vm/ksm translations/zh_CN/mm/ksm
+translations/zh_CN/vm/memory-model translations/zh_CN/mm/memory-model
+translations/zh_CN/vm/mmu_notifier translations/zh_CN/mm/mmu_notifier
+translations/zh_CN/vm/numa translations/zh_CN/mm/numa
+translations/zh_CN/vm/overcommit-accounting translations/zh_CN/mm/overcommit-accounting
+translations/zh_CN/vm/page_frags translations/zh_CN/mm/page_frags
+translations/zh_CN/vm/page_owner translations/zh_CN/mm/page_owner
+translations/zh_CN/vm/page_table_check translations/zh_CN/mm/page_table_check
+translations/zh_CN/vm/remap_file_pages translations/zh_CN/mm/remap_file_pages
+translations/zh_CN/vm/split_page_table_lock translations/zh_CN/mm/split_page_table_lock
+translations/zh_CN/vm/zsmalloc translations/zh_CN/mm/zsmalloc
+translations/zh_TW/arm64/amu translations/zh_TW/arch/arm64/amu
+translations/zh_TW/arm64/elf_hwcaps translations/zh_TW/arch/arm64/elf_hwcaps
+translations/zh_TW/arm64/hugetlbpage translations/zh_TW/arch/arm64/hugetlbpage
+translations/zh_TW/arm64/index translations/zh_TW/arch/arm64/index
+translations/zh_TW/arm64/perf translations/zh_TW/arch/arm64/perf
+tty/device_drivers/oxsemi-tornado misc-devices/oxsemi-tornado
+tty/index driver-api/tty/index
+tty/n_tty driver-api/tty/n_tty
+tty/tty_buffer driver-api/tty/tty_buffer
+tty/tty_driver driver-api/tty/tty_driver
+tty/tty_internals driver-api/tty/tty_internals
+tty/tty_ldisc driver-api/tty/tty_ldisc
+tty/tty_port driver-api/tty/tty_port
+tty/tty_struct driver-api/tty/tty_struct
+usb/typec driver-api/usb/typec
+usb/usb3-debug-port driver-api/usb/usb3-debug-port
+userspace-api/media/drivers/st-vgxy61 userspace-api/media/drivers/vgxy61
+userspace-api/media/v4l/pixfmt-meta-d4xx userspace-api/media/v4l/metafmt-d4xx
+userspace-api/media/v4l/pixfmt-meta-intel-ipu3 userspace-api/media/v4l/metafmt-intel-ipu3
+userspace-api/media/v4l/pixfmt-meta-rkisp1 userspace-api/media/v4l/metafmt-rkisp1
+userspace-api/media/v4l/pixfmt-meta-uvc userspace-api/media/v4l/metafmt-uvc
+userspace-api/media/v4l/pixfmt-meta-vivid userspace-api/media/v4l/metafmt-vivid
+userspace-api/media/v4l/pixfmt-meta-vsp1-hgo userspace-api/media/v4l/metafmt-vsp1-hgo
+userspace-api/media/v4l/pixfmt-meta-vsp1-hgt userspace-api/media/v4l/metafmt-vsp1-hgt
+virt/coco/sevguest virt/coco/sev-guest
+virt/kvm/amd-memory-encryption virt/kvm/x86/amd-memory-encryption
+virt/kvm/arm/psci virt/kvm/arm/fw-pseudo-registers
+virt/kvm/cpuid virt/kvm/x86/cpuid
+virt/kvm/hypercalls virt/kvm/x86/hypercalls
+virt/kvm/mmu virt/kvm/x86/mmu
+virt/kvm/msr virt/kvm/x86/msr
+virt/kvm/nested-vmx virt/kvm/x86/nested-vmx
+virt/kvm/running-nested-guests virt/kvm/x86/running-nested-guests
+virt/kvm/s390-diag virt/kvm/s390/s390-diag
+virt/kvm/s390-pv virt/kvm/s390/s390-pv
+virt/kvm/s390-pv-boot virt/kvm/s390/s390-pv-boot
+virt/kvm/timekeeping virt/kvm/x86/timekeeping
+virt/kvm/x86/halt-polling virt/kvm/halt-polling
+virtual/index virt/index
+virtual/kvm/amd-memory-encryption virt/kvm/x86/amd-memory-encryption
+virtual/kvm/cpuid virt/kvm/x86/cpuid
+virtual/kvm/index virt/kvm/index
+virtual/kvm/vcpu-requests virt/kvm/vcpu-requests
+virtual/paravirt_ops virt/paravirt_ops
+vm/active_mm mm/active_mm
+vm/arch_pgtable_helpers mm/arch_pgtable_helpers
+vm/balance mm/balance
+vm/bootmem mm/bootmem
+vm/damon/api mm/damon/api
+vm/damon/design mm/damon/design
+vm/damon/faq mm/damon/faq
+vm/damon/index mm/damon/index
+vm/free_page_reporting mm/free_page_reporting
+vm/highmem mm/highmem
+vm/hmm mm/hmm
+vm/hugetlbfs_reserv mm/hugetlbfs_reserv
+vm/hugetlbpage admin-guide/mm/hugetlbpage
+vm/hwpoison mm/hwpoison
+vm/idle_page_tracking admin-guide/mm/idle_page_tracking
+vm/index mm/index
+vm/ksm mm/ksm
+vm/memory-model mm/memory-model
+vm/mmu_notifier mm/mmu_notifier
+vm/numa mm/numa
+vm/numa_memory_policy admin-guide/mm/numa_memory_policy
+vm/oom mm/oom
+vm/overcommit-accounting mm/overcommit-accounting
+vm/page_allocation mm/page_allocation
+vm/page_cache mm/page_cache
+vm/page_frags mm/page_frags
+vm/page_migration mm/page_migration
+vm/page_owner mm/page_owner
+vm/page_reclaim mm/page_reclaim
+vm/page_table_check mm/page_table_check
+vm/page_tables mm/page_tables
+vm/pagemap admin-guide/mm/pagemap
+vm/physical_memory mm/physical_memory
+vm/process_addrs mm/process_addrs
+vm/remap_file_pages mm/remap_file_pages
+vm/shmfs mm/shmfs
+vm/slab mm/slab
+vm/slub admin-guide/mm/slab
+vm/soft-dirty admin-guide/mm/soft-dirty
+vm/split_page_table_lock mm/split_page_table_lock
+vm/swap mm/swap
+vm/swap_numa admin-guide/mm/swap_numa
+vm/transhuge mm/transhuge
+vm/unevictable-lru mm/unevictable-lru
+vm/userfaultfd admin-guide/mm/userfaultfd
+vm/vmalloc mm/vmalloc
+vm/vmalloced-kernel-stacks mm/vmalloced-kernel-stacks
+vm/vmemmap_dedup mm/vmemmap_dedup
+vm/zsmalloc mm/zsmalloc
+vm/zswap admin-guide/mm/zswap
+watch_queue core-api/watch_queue
+x86/amd-memory-encryption arch/x86/amd-memory-encryption
+x86/amd_hsmp arch/x86/amd_hsmp
+x86/boot arch/x86/boot
+x86/booting-dt arch/x86/booting-dt
+x86/buslock arch/x86/buslock
+x86/cpuinfo arch/x86/cpuinfo
+x86/earlyprintk arch/x86/earlyprintk
+x86/elf_auxvec arch/x86/elf_auxvec
+x86/entry_64 arch/x86/entry_64
+x86/exception-tables arch/x86/exception-tables
+x86/features
arch/x86/features +x86/i386/IO-APIC arch/x86/i386/IO-APIC +x86/i386/index arch/x86/i386/index +x86/ifs arch/x86/ifs +x86/index arch/x86/index +x86/intel-hfi arch/x86/intel-hfi +x86/intel_txt arch/x86/intel_txt +x86/iommu arch/x86/iommu +x86/kernel-stacks arch/x86/kernel-stacks +x86/mds arch/x86/mds +x86/microcode arch/x86/microcode +x86/mtrr arch/x86/mtrr +x86/orc-unwinder arch/x86/orc-unwinder +x86/pat arch/x86/pat +x86/protection-keys core-api/protection-keys +x86/pti arch/x86/pti +x86/resctrl filesystems/resctrl +x86/resctrl_ui filesystems/resctrl +x86/sgx arch/x86/sgx +x86/sva arch/x86/sva +x86/tdx arch/x86/tdx +x86/tlb arch/x86/tlb +x86/topology arch/x86/topology +x86/tsx_async_abort arch/x86/tsx_async_abort +x86/usb-legacy-support arch/x86/usb-legacy-support +x86/x86_64/5level-paging arch/x86/x86_64/5level-paging +x86/x86_64/cpu-hotplug-spec arch/x86/x86_64/cpu-hotplug-spec +x86/x86_64/fake-numa-for-cpusets arch/x86/x86_64/fake-numa-for-cpusets +x86/x86_64/fsgs arch/x86/x86_64/fsgs +x86/x86_64/index arch/x86/x86_64/index +x86/x86_64/machinecheck arch/x86/x86_64/machinecheck +x86/x86_64/mm arch/x86/x86_64/mm +x86/x86_64/uefi arch/x86/x86_64/uefi +x86/xstate arch/x86/xstate +x86/zero-page arch/x86/zero-page +xilinx/eemi driver-api/xilinx/eemi +xilinx/index driver-api/xilinx/index +xtensa/atomctl arch/xtensa/atomctl +xtensa/booting arch/xtensa/booting +xtensa/features arch/xtensa/features +xtensa/index arch/xtensa/index +xtensa/mmu arch/xtensa/mmu diff --git a/Documentation/Makefile b/Documentation/Makefile index 820f07e0afe6..3609cb86137b 100644 --- a/Documentation/Makefile +++ b/Documentation/Makefile @@ -60,8 +60,8 @@ ifeq ($(HAVE_LATEXMK),1) endif #HAVE_LATEXMK # Internal variables. -PAPEROPT_a4 = -D latex_paper_size=a4 -PAPEROPT_letter = -D latex_paper_size=letter +PAPEROPT_a4 = -D latex_elements.papersize=a4paper +PAPEROPT_letter = -D latex_elements.papersize=letterpaper ALLSPHINXOPTS = -D kerneldoc_srctree=$(srctree) -D kerneldoc_bin=$(KERNELDOC) ALLSPHINXOPTS += $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) ifneq ($(wildcard $(srctree)/.config),) @@ -87,7 +87,7 @@ loop_cmd = $(echo-cmd) $(cmd_$(1)) || exit; PYTHONPYCACHEPREFIX ?= $(abspath $(BUILDDIR)/__pycache__) quiet_cmd_sphinx = SPHINX $@ --> file://$(abspath $(BUILDDIR)/$3/$4) - cmd_sphinx = $(MAKE) BUILDDIR=$(abspath $(BUILDDIR)) $(build)=Documentation/userspace-api/media $2 && \ + cmd_sphinx = \ PYTHONPYCACHEPREFIX="$(PYTHONPYCACHEPREFIX)" \ BUILDDIR=$(abspath $(BUILDDIR)) SPHINX_CONF=$(abspath $(src)/$5/$(SPHINX_CONF)) \ $(PYTHON3) $(srctree)/scripts/jobserver-exec \ @@ -108,6 +108,9 @@ htmldocs: @$(srctree)/scripts/sphinx-pre-install --version-check @+$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,html,$(var),,$(var))) +htmldocs-redirects: $(srctree)/Documentation/.renames.txt + @tools/docs/gen-redirects.py --output $(BUILDDIR) < $< + # If Rust support is available and .config exists, add rustdoc generated contents. 
# If there are any, the errors from this make rustdoc will be displayed but # won't stop the execution of htmldocs @@ -171,11 +174,11 @@ refcheckdocs: cleandocs: $(Q)rm -rf $(BUILDDIR) - $(Q)$(MAKE) BUILDDIR=$(abspath $(BUILDDIR)) $(build)=Documentation/userspace-api/media clean dochelp: @echo ' Linux kernel internal documentation in different formats from ReST:' @echo ' htmldocs - HTML' + @echo ' htmldocs-redirects - generate HTML redirects for moved pages' @echo ' texinfodocs - Texinfo' @echo ' infodocs - Info' @echo ' latexdocs - LaTeX' diff --git a/Documentation/PCI/endpoint/pci-endpoint-cfs.rst b/Documentation/PCI/endpoint/pci-endpoint-cfs.rst index fb73345cfb8a..e69c2872ce3b 100644 --- a/Documentation/PCI/endpoint/pci-endpoint-cfs.rst +++ b/Documentation/PCI/endpoint/pci-endpoint-cfs.rst @@ -86,7 +86,7 @@ The <EPF Device> directory can have a list of symbolic links be created by the user to represent the virtual functions that are bound to the physical function. In the above directory structure <EPF Device 11> is a physical function and <EPF Device 31> is a virtual function. An EPF device once -it's linked to another EPF device, cannot be linked to a EPC device. +it's linked to another EPF device, cannot be linked to an EPC device. EPC Device ========== @@ -108,7 +108,7 @@ entries corresponding to EPC device will be created by the EPC core. The <EPC Device> directory will have a list of symbolic links to <EPF Device>. These symbolic links should be created by the user to represent the functions present in the endpoint device. Only <EPF Device> -that represents a physical function can be linked to a EPC device. +that represents a physical function can be linked to an EPC device. The <EPC Device> directory will also have a *start* field. Once "1" is written to this field, the endpoint device will be ready to diff --git a/Documentation/PCI/endpoint/pci-endpoint.rst b/Documentation/PCI/endpoint/pci-endpoint.rst index 599763aa01ca..0741c8cbd74e 100644 --- a/Documentation/PCI/endpoint/pci-endpoint.rst +++ b/Documentation/PCI/endpoint/pci-endpoint.rst @@ -197,8 +197,8 @@ by the PCI endpoint function driver. * pci_epf_register_driver() The PCI Endpoint Function driver should implement the following ops: - * bind: ops to perform when a EPC device has been bound to EPF device - * unbind: ops to perform when a binding has been lost between a EPC + * bind: ops to perform when an EPC device has been bound to EPF device + * unbind: ops to perform when a binding has been lost between an EPC device and EPF device * add_cfs: optional ops to create function specific configfs attributes @@ -251,7 +251,7 @@ pci-ep-cfs.c can be used as reference for using these APIs. * pci_epf_bind() pci_epf_bind() should be invoked when the EPF device has been bound to - a EPC device. + an EPC device. * pci_epf_unbind() diff --git a/Documentation/RCU/lockdep.rst b/Documentation/RCU/lockdep.rst index 69e73a39bd11..741b157bbacb 100644 --- a/Documentation/RCU/lockdep.rst +++ b/Documentation/RCU/lockdep.rst @@ -106,7 +106,7 @@ or the RCU-protected data that it points to can change concurrently. Like rcu_dereference(), when lockdep is enabled, RCU list and hlist traversal primitives check for being called from within an RCU read-side critical section. However, a lockdep expression can be passed to them -as a additional optional argument. With this lockdep expression, these +as an additional optional argument. 
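For example, given a hypothetical list ``mylist`` whose updates are excluded by ``mylock`` (all of these names are illustrative, not from the kernel tree), a reader that already holds the lock can say so explicitly::

    struct myitem *p;

    /* Safe without rcu_read_lock(): updates are excluded by mylock. */
    lockdep_assert_held(&mylock);
    list_for_each_entry_rcu(p, &mylist, node, lockdep_is_held(&mylock))
        use_item(p);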
With this lockdep expression, these traversal primitives will complain only if the lockdep expression is false and they are called from outside any RCU read-side critical section. diff --git a/Documentation/RCU/stallwarn.rst b/Documentation/RCU/stallwarn.rst index d1ccd6039a8c..d7c8eff63317 100644 --- a/Documentation/RCU/stallwarn.rst +++ b/Documentation/RCU/stallwarn.rst @@ -119,7 +119,7 @@ warnings: uncommon in large datacenter. In one memorable case some decades back, a CPU failed in a running system, becoming unresponsive, but not causing an immediate crash. This resulted in a series - of RCU CPU stall warnings, eventually leading the realization + of RCU CPU stall warnings, eventually leading to the realization that the CPU had failed. The RCU, RCU-sched, RCU-tasks, and RCU-tasks-trace implementations have diff --git a/Documentation/admin-guide/LSM/SafeSetID.rst b/Documentation/admin-guide/LSM/SafeSetID.rst index 0ec34863c674..6d439c987563 100644 --- a/Documentation/admin-guide/LSM/SafeSetID.rst +++ b/Documentation/admin-guide/LSM/SafeSetID.rst @@ -41,7 +41,7 @@ namespace). The higher level goal is to allow for uid-based sandboxing of system services without having to give out CAP_SETUID all over the place just so that non-root programs can drop to even-lesser-privileged uids. This is especially relevant when one non-root daemon on the system should be allowed to spawn other -processes as different uids, but its undesirable to give the daemon a +processes as different uids, but it's undesirable to give the daemon a basically-root-equivalent CAP_SETUID. diff --git a/Documentation/admin-guide/RAS/main.rst b/Documentation/admin-guide/RAS/main.rst index 7ac1d4ccc509..447bfde509fb 100644 --- a/Documentation/admin-guide/RAS/main.rst +++ b/Documentation/admin-guide/RAS/main.rst @@ -253,7 +253,7 @@ interface. Some architectures have ECC detectors for L1, L2 and L3 caches, along with DMA engines, fabric switches, main data path switches, interconnections, and various other hardware data paths. If the hardware -reports it, then a edac_device device probably can be constructed to +reports it, then an edac_device device probably can be constructed to harvest and present that to userspace. diff --git a/Documentation/admin-guide/aoe/udev.txt b/Documentation/admin-guide/aoe/udev.txt index 5fb756466bc7..d55ecb411c21 100644 --- a/Documentation/admin-guide/aoe/udev.txt +++ b/Documentation/admin-guide/aoe/udev.txt @@ -2,7 +2,7 @@ # They may be installed along the following lines. Check the section # 8 udev manpage to see whether your udev supports SUBSYSTEM, and # whether it uses one or two equal signs for SUBSYSTEM and KERNEL. 
-# +# # ecashin@makki ~$ su # Password: # bash# find /etc -type f -name udev.conf @@ -13,7 +13,7 @@ # 10-wacom.rules 50-udev.rules # bash# cp /path/to/linux/Documentation/admin-guide/aoe/udev.txt \ # /etc/udev/rules.d/60-aoe.rules -# +# # aoe char devices SUBSYSTEM=="aoe", KERNEL=="discover", NAME="etherd/%k", GROUP="disk", MODE="0220" @@ -22,5 +22,5 @@ SUBSYSTEM=="aoe", KERNEL=="interfaces", NAME="etherd/%k", GROUP="disk", MODE="02 SUBSYSTEM=="aoe", KERNEL=="revalidate", NAME="etherd/%k", GROUP="disk", MODE="0220" SUBSYSTEM=="aoe", KERNEL=="flush", NAME="etherd/%k", GROUP="disk", MODE="0220" -# aoe block devices +# aoe block devices KERNEL=="etherd*", GROUP="disk" diff --git a/Documentation/admin-guide/blockdev/paride.rst b/Documentation/admin-guide/blockdev/paride.rst index e85ad37cc0e5..b2f627d4c2f8 100644 --- a/Documentation/admin-guide/blockdev/paride.rst +++ b/Documentation/admin-guide/blockdev/paride.rst @@ -118,7 +118,7 @@ and high-level drivers that you would use: ================ ============ ======== All parports and all protocol drivers are probed automatically unless probe=0 -parameter is used. So just "modprobe epat" is enough for a Imation SuperDisk +parameter is used. So just "modprobe epat" is enough for an Imation SuperDisk drive to work. Manual device creation:: diff --git a/Documentation/admin-guide/device-mapper/delay.rst b/Documentation/admin-guide/device-mapper/delay.rst index 4d667228e744..a1e673c0e782 100644 --- a/Documentation/admin-guide/device-mapper/delay.rst +++ b/Documentation/admin-guide/device-mapper/delay.rst @@ -3,7 +3,7 @@ dm-delay ======== Device-Mapper's "delay" target delays reads and/or writes -and/or flushs and optionally maps them to different devices. +and/or flushes and optionally maps them to different devices. Arguments:: @@ -18,7 +18,7 @@ Table line has to either have 3, 6 or 9 arguments: to write and flush operations on optionally different write_device with optionally different sector offset -9: same as 6 arguments plus define flush_offset and flush_delay explicitely +9: same as 6 arguments plus define flush_offset and flush_delay explicitly on/with optionally different flush_device/flush_offset. Offsets are specified in sectors. @@ -40,7 +40,7 @@ Example scripts #!/bin/sh # # Create mapped device delaying write and flush operations for 400ms and - # splitting reads to device $1 but writes and flushs to different device $2 + # splitting reads to device $1 but writes and flushes to different device $2 # to different offsets of 2048 and 4096 sectors respectively. # dmsetup create delayed --table "0 `blockdev --getsz $1` delay $1 2048 0 $2 4096 400" @@ -48,7 +48,7 @@ Example scripts :: #!/bin/sh # - # Create mapped device delaying reads for 50ms, writes for 100ms and flushs for 333ms + # Create mapped device delaying reads for 50ms, writes for 100ms and flushes for 333ms # onto the same backing device at offset 0 sectors. # dmsetup create delayed --table "0 `blockdev --getsz $1` delay $1 0 50 $2 0 100 $1 0 333" diff --git a/Documentation/admin-guide/device-mapper/dm-pcache.rst b/Documentation/admin-guide/device-mapper/dm-pcache.rst new file mode 100644 index 000000000000..09d327ef4b14 --- /dev/null +++ b/Documentation/admin-guide/device-mapper/dm-pcache.rst @@ -0,0 +1,202 @@ +.. 
SPDX-License-Identifier: GPL-2.0 + +================================= +dm-pcache — Persistent Cache +================================= + +*Author: Dongsheng Yang <dongsheng.yang@linux.dev>* + +This document describes *dm-pcache*, a Device-Mapper target that lets a +byte-addressable *DAX* (persistent-memory, “pmem”) region act as a +high-performance, crash-persistent cache in front of a slower block +device. The code lives in `drivers/md/dm-pcache/`. + +Quick feature summary +===================== + +* *Write-back* caching (only mode currently supported). +* *16 MiB segments* allocated on the pmem device. +* *Data CRC32* verification (optional, per cache). +* Crash-safe: every metadata structure is duplicated (`PCACHE_META_INDEX_MAX + == 2`) and protected with CRC+sequence numbers. +* *Multi-tree indexing* (indexing trees sharded by logical address) for high PMem parallelism. +* Pure *DAX path* I/O – no extra BIO round-trips. +* *Log-structured write-back* that preserves backend crash-consistency. + + +Constructor +=========== + +:: + + pcache <cache_dev> <backing_dev> [<number_of_optional_arguments> <cache_mode writeback> <data_crc true|false>] + +========================= ==================================================== +``cache_dev`` Any DAX-capable block device (``/dev/pmem0``…). + All metadata *and* cached blocks are stored here. + +``backing_dev`` The slow block device to be cached. + +``cache_mode`` Optional. Only ``writeback`` is accepted at the + moment. + +``data_crc`` Optional, defaults to ``false``. + + * ``true`` – store CRC32 for every cached entry + and verify on reads + * ``false`` – skip CRC (faster) +========================= ==================================================== + +Example +------- + +.. code-block:: shell + + dmsetup create pcache_sdb --table \ + "0 $(blockdev --getsz /dev/sdb) pcache /dev/pmem0 /dev/sdb 4 cache_mode writeback data_crc true" + +The first time a pmem device is used, dm-pcache formats it automatically +(super-block, cache_info, etc.). + + +Status line +=========== + +``dmsetup status <device>`` (``STATUSTYPE_INFO``) prints: + +:: + + <sb_flags> <seg_total> <cache_segs> <segs_used> \ + <gc_percent> <cache_flags> \ + <key_head_seg>:<key_head_off> \ + <dirty_tail_seg>:<dirty_tail_off> \ + <key_tail_seg>:<key_tail_off> + +Field meanings +-------------- + +=============================== ============================================= +``sb_flags`` Super-block flags (e.g. endian marker). + +``seg_total`` Number of physical *pmem* segments. + +``cache_segs`` Number of segments used for cache. + +``segs_used`` Segments currently allocated (bitmap weight). + +``gc_percent`` Current GC high-water mark (0-90). + +``cache_flags`` Bit 0 – DATA_CRC enabled + Bit 1 – INIT_DONE (cache initialised) + Bits 2-5 – cache mode (0 == WB). + +``key_head`` Where new key-sets are being written. + +``dirty_tail`` First dirty key-set that still needs + write-back to the backing device. + +``key_tail`` First key-set that may be reclaimed by GC. +=============================== ============================================= + + +Messages +======== + +*Change GC trigger* + +:: + + dmsetup message <dev> 0 gc_percent <0-90> + + +Theory of operation +=================== + +Sub-devices +----------- + +==================== ========================================================= +backing_dev Any block device (SSD/HDD/loop/LVM, etc.). +cache_dev DAX device; must expose direct-access memory.
+==================== ========================================================= + +Segments and key-sets +--------------------- + +* The pmem space is divided into *16 MiB segments*. +* Each write allocates space from a per-CPU *data_head* inside a segment. +* A *cache-key* records a logical range on the origin and where it lives + inside pmem (segment + offset + generation). +* 128 keys form a *key-set* (kset); ksets are written sequentially in pmem + and are themselves crash-safe (CRC). +* The pair *(key_tail, dirty_tail)* delimits clean/dirty and live/dead ksets. + +Write-back +---------- + +Dirty keys are queued into a tree; a background worker copies data +back to the backing_dev and advances *dirty_tail*. A FLUSH/FUA bio from the +upper layers forces an immediate metadata commit. + +Garbage collection +------------------ + +GC starts when ``segs_used >= seg_total * gc_percent / 100``. It walks +from *key_tail*, frees segments whose every key has been invalidated, and +advances *key_tail*. + +CRC verification +---------------- + +If ``data_crc`` is enabled, dm-pcache computes a CRC32 over every cached data +range when it is inserted and stores it in the on-media key. Reads +validate the CRC before copying to the caller. + + +Failure handling +================ + +* *pmem media errors* – all metadata copies are read with + ``copy_mc_to_kernel``; an uncorrectable error is logged and initialisation + is aborted. +* *Cache full* – if no free segment can be found, writes return ``-EBUSY``; + dm-pcache retries internally (request deferral). +* *System crash* – on attach, the driver replays ksets from *key_tail* to + rebuild the in-core trees; every segment’s generation guards against + use-after-free keys. + + +Limitations & TODO +================== + +* Only *write-back* mode; other modes planned. +* Only FIFO cache invalidation; others (LRU, ARC, ...) planned. +* Table reload is not supported currently. +* Discard planned. + + +Example workflow +================ + +.. code-block:: shell + + # 1. Create devices + dmsetup create pcache_sdb --table \ + "0 $(blockdev --getsz /dev/sdb) pcache /dev/pmem0 /dev/sdb 4 cache_mode writeback data_crc true" + + # 2. Put a filesystem on top + mkfs.ext4 /dev/mapper/pcache_sdb + mount /dev/mapper/pcache_sdb /mnt + + # 3. Tune GC threshold to 80% + dmsetup message pcache_sdb 0 gc_percent 80 + + # 4. Observe status + watch -n1 'dmsetup status pcache_sdb' + + # 5. Shutdown + umount /mnt + dmsetup remove pcache_sdb + + +``dm-pcache`` is under active development; feedback, bug reports and patches +are very welcome! diff --git a/Documentation/admin-guide/device-mapper/index.rst b/Documentation/admin-guide/device-mapper/index.rst index cc5aec861576..f1c1f4b824ba 100644 --- a/Documentation/admin-guide/device-mapper/index.rst +++ b/Documentation/admin-guide/device-mapper/index.rst @@ -18,6 +18,7 @@ Device Mapper dm-integrity dm-io dm-log + dm-pcache dm-queue-length dm-raid dm-service-time diff --git a/Documentation/admin-guide/device-mapper/vdo-design.rst b/Documentation/admin-guide/device-mapper/vdo-design.rst index 3cd59decbec0..faa0ecd4a5ae 100644 --- a/Documentation/admin-guide/device-mapper/vdo-design.rst +++ b/Documentation/admin-guide/device-mapper/vdo-design.rst @@ -600,7 +600,7 @@ lock and return itself to the pool. All storage within vdo is managed as 4KB blocks, but it can accept writes as small as 512 bytes. Processing a write that is smaller than 4K requires a read-modify-write operation that reads the relevant 4K block, copies the
Processing a write that is smaller than 4K requires a read-modify-write operation that reads the relevant 4K block, copies the -new data over the approriate sectors of the block, and then launches a +new data over the appropriate sectors of the block, and then launches a write operation for the modified data block. The read and write stages of this operation are nearly identical to the normal read and write operations, and a single data_vio is used throughout this operation. diff --git a/Documentation/admin-guide/device-mapper/vdo.rst b/Documentation/admin-guide/device-mapper/vdo.rst index a14e6d3e787c..8a67b320a97b 100644 --- a/Documentation/admin-guide/device-mapper/vdo.rst +++ b/Documentation/admin-guide/device-mapper/vdo.rst @@ -1,5 +1,6 @@ .. SPDX-License-Identifier: GPL-2.0-only +====== dm-vdo ====== diff --git a/Documentation/admin-guide/ext4.rst b/Documentation/admin-guide/ext4.rst index b857eb6ca1b6..ac0c709ea9e7 100644 --- a/Documentation/admin-guide/ext4.rst +++ b/Documentation/admin-guide/ext4.rst @@ -398,7 +398,7 @@ There are 3 different data modes: * writeback mode In data=writeback mode, ext4 does not journal data at all. This mode provides - a similar level of journaling as that of XFS, JFS, and ReiserFS in its default + a similar level of journaling as that of XFS and JFS in its default mode - metadata journaling. A crash+recovery can cause incorrect data to appear in files which were written shortly before the crash. This mode will typically provide the best ext4 performance. diff --git a/Documentation/admin-guide/hw-vuln/mds.rst b/Documentation/admin-guide/hw-vuln/mds.rst index 48c7b0b72aed..754679db0ce8 100644 --- a/Documentation/admin-guide/hw-vuln/mds.rst +++ b/Documentation/admin-guide/hw-vuln/mds.rst @@ -214,7 +214,7 @@ XEON PHI specific considerations command line with the 'ring3mwait=disable' command line option. XEON PHI is not affected by the other MDS variants and MSBDS is mitigated - before the CPU enters a idle state. As XEON PHI is not affected by L1TF + before the CPU enters an idle state. As XEON PHI is not affected by L1TF either disabling SMT is not required for full protection. .. _mds_smt_control: diff --git a/Documentation/admin-guide/hw-vuln/spectre.rst b/Documentation/admin-guide/hw-vuln/spectre.rst index 132e0bc6007e..991f12adef8d 100644 --- a/Documentation/admin-guide/hw-vuln/spectre.rst +++ b/Documentation/admin-guide/hw-vuln/spectre.rst @@ -664,7 +664,7 @@ Intel white papers: .. _spec_ref1: -[1] `Intel analysis of speculative execution side channels <https://newsroom.intel.com/wp-content/uploads/sites/11/2018/01/Intel-Analysis-of-Speculative-Execution-Side-Channels.pdf>`_. +[1] `Intel analysis of speculative execution side channels <https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/analysis-of-speculative-execution-side-channels-white-paper.pdf>`_. .. _spec_ref2: @@ -682,7 +682,7 @@ AMD white papers: .. _spec_ref5: -[5] `AMD64 technology indirect branch control extension <https://developer.amd.com/wp-content/resources/Architecture_Guidelines_Update_Indirect_Branch_Control.pdf>`_. +[5] `AMD64 technology indirect branch control extension <https://www.amd.com/content/dam/amd/en/documents/processor-tech-docs/white-papers/111006-architecture-guidelines-update-amd64-technology-indirect-branch-control-extension.pdf>`_. .. _spec_ref6: @@ -708,7 +708,7 @@ MIPS white paper: .. 
_spec_ref10: -[10] `MIPS: response on speculative execution and side channel vulnerabilities <https://www.mips.com/blog/mips-response-on-speculative-execution-and-side-channel-vulnerabilities/>`_. +[10] `MIPS: response on speculative execution and side channel vulnerabilities <https://web.archive.org/web/20220512003005if_/https://www.mips.com/blog/mips-response-on-speculative-execution-and-side-channel-vulnerabilities/>`_. Academic papers: diff --git a/Documentation/admin-guide/kdump/kdump.rst b/Documentation/admin-guide/kdump/kdump.rst index 9c6cd52f69cf..7b011eb116a7 100644 --- a/Documentation/admin-guide/kdump/kdump.rst +++ b/Documentation/admin-guide/kdump/kdump.rst @@ -471,7 +471,7 @@ Notes on loading the dump-capture kernel: performance degradation. To enable multi-cpu support, you should bring up an SMP dump-capture kernel and specify maxcpus/nr_cpus options while loading it. -* For s390x there are two kdump modes: If a ELF header is specified with +* For s390x there are two kdump modes: If an ELF header is specified with the elfcorehdr= kernel parameter, it is used by the kdump kernel as it is done on all other architectures. If no elfcorehdr= kernel parameter is specified, the s390x kdump kernel dynamically creates the header. The diff --git a/Documentation/admin-guide/kernel-parameters.rst b/Documentation/admin-guide/kernel-parameters.rst index 39d0e7ff0965..7bf8cc7df6b5 100644 --- a/Documentation/admin-guide/kernel-parameters.rst +++ b/Documentation/admin-guide/kernel-parameters.rst @@ -1,3 +1,5 @@ +.. SPDX-License-Identifier: GPL-2.0 + .. _kernelparameters: The kernel's command-line parameters @@ -213,7 +215,7 @@ need or coordination with <Documentation/arch/x86/boot.rst>. There are also arch-specific kernel-parameters not documented here. Note that ALL kernel parameters listed below are CASE SENSITIVE, and that -a trailing = on the name of any parameter states that that parameter will +a trailing = on the name of any parameter states that the parameter will be entered as an environment variable, whereas its absence indicates that it will appear as a kernel argument readable via /proc/cmdline by programs running once the system is up. diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 74ca438d2d6d..3edc5ce0e2a3 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3705,7 +3705,7 @@ looking for corruption. Enabling this will both detect corruption and prevent the kernel from using the memory being corrupted. - However, its intended as a diagnostic tool; if + However, it's intended as a diagnostic tool; if repeatable BIOS-originated corruption always affects the same memory, you can use memmap= to prevent the kernel from using that memory. @@ -7400,7 +7400,7 @@ (converted into nanoseconds). Fast, but depending on the architecture, may not be in sync between CPUs. - global - Event time stamps are synchronize across + global - Event time stamps are synchronized across CPUs. May be slower than the local clock, but better for some race conditions. counter - Simple counting of events (1, 2, ..) @@ -7520,12 +7520,12 @@ section. trace_trigger=[trigger-list] - [FTRACE] Add a event trigger on specific events. + [FTRACE] Add an event trigger on specific events. Set a trigger on top of a specific event, with an optional filter. - The format is is "trace_trigger=<event>.<trigger>[ if <filter>],..." 
- Where more than one trigger may be specified that are comma deliminated. + The format is "trace_trigger=<event>.<trigger>[ if <filter>],..." + More than one trigger may be specified, comma delimited. For example: @@ -7533,7 +7533,7 @@ The above will enable the "stacktrace" trigger on the "sched_switch" event but only trigger it if the "prev_state" of the "sched_switch" - event is "2" (TASK_UNINTERUPTIBLE). + event is "2" (TASK_UNINTERRUPTIBLE). See also "Event triggers" in Documentation/trace/events.rst diff --git a/Documentation/admin-guide/laptops/laptop-mode.rst b/Documentation/admin-guide/laptops/laptop-mode.rst index b61cc601d298..66eb9cd918b5 100644 --- a/Documentation/admin-guide/laptops/laptop-mode.rst +++ b/Documentation/admin-guide/laptops/laptop-mode.rst @@ -61,7 +61,7 @@ Caveats Check your drive's rating, and don't wear down your drive's lifetime if you don't need to. -* If you mount some of your ext3/reiserfs filesystems with the -n option, then +* If you mount some of your ext3 filesystems with the -n option, then the control script will not be able to remount them correctly. You must set DO_REMOUNTS=0 in the control script, otherwise it will remount them with the wrong options -- or it will fail because it cannot write to /etc/mtab. @@ -96,7 +96,7 @@ control script increases dirty_expire_centisecs and dirty_writeback_centisecs in dirtied are not forced to be written to disk as often. The control script also changes the dirty background ratio, so that background writeback of dirty pages is not done anymore. Combined with a higher commit value (also 10 minutes) for -ext3 or ReiserFS filesystems (also done automatically by the control script), +the ext3 filesystem (also done automatically by the control script), this results in concentration of disk activity in a small time interval which occurs only once every 10 minutes, or whenever the disk is forced to spin up by a cache miss. The disk can then be spun down in the periods of inactivity. @@ -587,7 +587,7 @@ Control script:: FST=$(deduce_fstype $MP) fi case "$FST" in - "ext3"|"reiserfs") + "ext3") PARSEDOPTS="$(parse_mount_opts commit "$OPTS")" mount $DEV -t $FST $MP -o remount,$PARSEDOPTS,commit=$MAX_AGE$NOATIME_OPT ;; @@ -647,7 +647,7 @@ Control script:: FST=$(deduce_fstype $MP) fi case "$FST" in - "ext3"|"reiserfs") + "ext3") PARSEDOPTS="$(parse_mount_opts_wfstab $DEV commit $OPTS)" PARSEDOPTS="$(parse_yesno_opts_wfstab $DEV atime atime $PARSEDOPTS)" mount $DEV -t $FST $MP -o remount,$PARSEDOPTS diff --git a/Documentation/admin-guide/laptops/sonypi.rst b/Documentation/admin-guide/laptops/sonypi.rst index 190da1234314..7541f56e0007 100644 --- a/Documentation/admin-guide/laptops/sonypi.rst +++ b/Documentation/admin-guide/laptops/sonypi.rst @@ -25,7 +25,7 @@ generate, like: (when available) Those events (see linux/sonypi.h) can be polled using the character device node -/dev/sonypi (major 10, minor auto allocated or specified as a option). +/dev/sonypi (major 10, minor auto allocated or specified as an option). A simple daemon which translates the jogdial movements into mouse wheel events can be downloaded at: <http://popies.net/sonypi/> diff --git a/Documentation/admin-guide/md.rst b/Documentation/admin-guide/md.rst index 1c2eacc94758..deed823eab01 100644 --- a/Documentation/admin-guide/md.rst +++ b/Documentation/admin-guide/md.rst @@ -794,7 +794,7 @@ These currently include: journal_mode (currently raid5 only) The cache mode for raid5. raid5 could include an extra disk for
The mode can be "write-throuth" and "write-back". The + caching. The mode can be "write-through" or "write-back". The default is "write-through". ppl_write_hint diff --git a/Documentation/admin-guide/media/imx.rst b/Documentation/admin-guide/media/imx.rst index b8fa70f854fd..bb68100d8acb 100644 --- a/Documentation/admin-guide/media/imx.rst +++ b/Documentation/admin-guide/media/imx.rst @@ -96,7 +96,7 @@ Some of the features of this driver include: motion compensation modes: low, medium, and high motion. Pipelines are defined that allow sending frames to the VDIC subdev directly from the CSI. There is also support in the future for sending frames to the - VDIC from memory buffers via a output/mem2mem devices. + VDIC from memory buffers via output/mem2mem devices. - Includes a Frame Interval Monitor (FIM) that can correct vertical sync problems with the ADV718x video decoders. diff --git a/Documentation/admin-guide/media/si4713.rst b/Documentation/admin-guide/media/si4713.rst index be8e6b49b7b4..85dcf1cd2df8 100644 --- a/Documentation/admin-guide/media/si4713.rst +++ b/Documentation/admin-guide/media/si4713.rst @@ -13,7 +13,7 @@ Contact: Eduardo Valentin <eduardo.valentin@nokia.com> Information about the Device ---------------------------- -This chip is a Silicon Labs product. It is a I2C device, currently on 0x63 address. +This chip is a Silicon Labs product. It is an I2C device, currently on 0x63 address. Basically, it has transmission and signal noise level measurement features. The Si4713 integrates transmit functions for FM broadcast stereo transmission. @@ -28,7 +28,7 @@ Users must comply with local regulations on radio frequency (RF) transmission. Device driver description ------------------------- -There are two modules to handle this device. One is a I2C device driver +There are two modules to handle this device. One is an I2C device driver and the other is a platform driver. The I2C device driver exports a v4l2-subdev interface to the kernel. @@ -113,7 +113,7 @@ Here is a summary of them: - acomp_attack_time - Sets the attack time for audio dynamic range control. - acomp_release_time - Sets the release time for audio dynamic range control. -* Limiter setups audio deviation limiter feature. Once a over deviation occurs, +* Limiter sets up the audio deviation limiter feature. Once an over deviation occurs, it is possible to adjust the front-end gain of the audio input and always prevent over deviation. diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index 2cae60b6f3ca..eae534bc1bee 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -360,7 +360,7 @@ The directory for the :ref:`quotas <damon_design_damos_quotas>` of the given DAMON-based operation scheme. Under ``quotas`` directory, four files (``ms``, ``bytes``, -``reset_interval_ms``, ``effective_bytes``) and two directores (``weights`` and +``reset_interval_ms``, ``effective_bytes``) and two directories (``weights`` and ``goals``) exist. You can set the ``time quota`` in milliseconds, ``size quota`` in bytes, and diff --git a/Documentation/admin-guide/nfs/nfsroot.rst b/Documentation/admin-guide/nfs/nfsroot.rst index 135218f33394..06990309c6ff 100644 --- a/Documentation/admin-guide/nfs/nfsroot.rst +++ b/Documentation/admin-guide/nfs/nfsroot.rst @@ -342,7 +342,7 @@ They depend on various facilities being available: When using pxelinux, the kernel image is specified using "kernel <relative-path-below /tftpboot>". 
The nfsroot parameters are passed to the kernel by adding them to the "append" line. - It is common to use serial console in conjunction with pxeliunx, + It is common to use serial console in conjunction with pxelinux, see Documentation/admin-guide/serial-console.rst for more information. For more information on isolinux, including how to create bootdisks diff --git a/Documentation/admin-guide/perf/hisi-pmu.rst b/Documentation/admin-guide/perf/hisi-pmu.rst index c4c2cbbf88cb..f3d9871906a2 100644 --- a/Documentation/admin-guide/perf/hisi-pmu.rst +++ b/Documentation/admin-guide/perf/hisi-pmu.rst @@ -110,8 +110,8 @@ uring channel. It is 2 bits. Some important codes are as follows: - 2'b11: count the events which sent to the uring_ext (MATA) channel; - 2'b01: is the same as 2'b11; - 2'b10: count the events which sent to the uring (non-MATA) channel; -- 2'b00: default value, count the events which sent to the both uring and - uring_ext channel; +- 2'b00: default value, count the events which sent to both uring and + uring_ext channels; 6. ch: NoC PMU supports filtering the event counts of certain transaction channel with this option. The current supported channels are as follows: diff --git a/Documentation/admin-guide/quickly-build-trimmed-linux.rst b/Documentation/admin-guide/quickly-build-trimmed-linux.rst index 4a5ffb0996a3..cb4b78468a93 100644 --- a/Documentation/admin-guide/quickly-build-trimmed-linux.rst +++ b/Documentation/admin-guide/quickly-build-trimmed-linux.rst @@ -273,7 +273,7 @@ again. does nothing at all; in that case you have to manually install your kernel, as outlined in the reference section. - If you are running a immutable Linux distribution, check its documentation + If you are running an immutable Linux distribution, check its documentation and the web to find out how to install your own kernel there. [:ref:`details<install>`] @@ -884,7 +884,7 @@ When a build error occurs, it might be caused by some aspect of your machine's setup that often can be fixed quickly; other times though the problem lies in the code and can only be fixed by a developer. A close examination of the failure messages coupled with some research on the internet will often tell you -which of the two it is. To perform such a investigation, restart the build +which of the two it is. To perform such an investigation, restart the build process like this:: make V=1 diff --git a/Documentation/admin-guide/reporting-issues.rst b/Documentation/admin-guide/reporting-issues.rst index 9a847506f6ec..a68e6d909274 100644 --- a/Documentation/admin-guide/reporting-issues.rst +++ b/Documentation/admin-guide/reporting-issues.rst @@ -611,7 +611,7 @@ better place. How to read the MAINTAINERS file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -To illustrate how to use the :ref:`MAINTAINERS <maintainers>` file, lets assume +To illustrate how to use the :ref:`MAINTAINERS <maintainers>` file, let's assume the WiFi in your Laptop suddenly misbehaves after updating the kernel. In that case it's likely an issue in the WiFi driver. Obviously it could also be some code it builds upon, but unless you suspect something like that stick to the @@ -1543,7 +1543,7 @@ as well, because that will speed things up. And note, it helps developers a great deal if you can specify the exact version that introduced the problem. Hence if possible within a reasonable time frame, -try to find that version using vanilla kernels. Lets assume something broke when +try to find that version using vanilla kernels. 
Let's assume something broke when your distributor released an update from Linux kernel 5.10.5 to 5.10.8. Then as instructed above go and check the latest kernel from that version line, say 5.10.9. If it shows the problem, try a vanilla 5.10.5 to ensure that no patches diff --git a/Documentation/admin-guide/sysctl/fs.rst b/Documentation/admin-guide/sysctl/fs.rst index 6c54718c9d04..9b7f65c3efd8 100644 --- a/Documentation/admin-guide/sysctl/fs.rst +++ b/Documentation/admin-guide/sysctl/fs.rst @@ -164,8 +164,8 @@ pipe-user-pages-soft -------------------- Maximum total number of pages a non-privileged user may allocate for pipes -before the pipe size gets limited to a single page. Once this limit is reached, -new pipes will be limited to a single page in size for this user in order to +before the pipe size gets limited to two pages. Once this limit is reached, +new pipes will be limited to two pages in size for this user in order to limit total memory usage, and trying to increase them using ``fcntl()`` will be denied until usage goes below the limit again. The default value allows to allocate up to 1024 pipes at their default size. When set to 0, no limit is diff --git a/Documentation/admin-guide/sysctl/index.rst b/Documentation/admin-guide/sysctl/index.rst index 03346f98c7b9..4dd2c9b5d752 100644 --- a/Documentation/admin-guide/sysctl/index.rst +++ b/Documentation/admin-guide/sysctl/index.rst @@ -66,25 +66,31 @@ This documentation is about: =============== =============================================================== abi/ execution domains & personalities -debug/ <empty> -dev/ device specific information (eg dev/cdrom/info) +<$ARCH> tuning controls for various CPU architectures (e.g. csky, s390) +crypto/ <undocumented> +debug/ <undocumented> +dev/ device specific information (e.g. dev/cdrom/info) fs/ specific filesystems filehandle, inode, dentry and quota tuning binfmt_misc <Documentation/admin-guide/binfmt-misc.rst> kernel/ global kernel info / tuning miscellaneous stuff + some architecture-specific controls + security (LSM) stuff net/ networking stuff, for documentation look in: <Documentation/networking/> proc/ <empty> sunrpc/ SUN Remote Procedure Call (NFS) +user/ Per user namespace limits vm/ memory management tuning buffer and cache management -user/ Per user per user namespace limits +xen/ <undocumented> =============== =============================================================== -These are the subdirs I have on my system. There might be more -or other subdirs in another setup. If you see another dir, I'd -really like to hear about it :-) +These are the subdirs I have on my system or that have been discovered by +searching through the source code. There might be more or other subdirs +in another setup. If you see another dir, I'd really like to hear about +it :-) .. toctree:: :maxdepth: 1 diff --git a/Documentation/admin-guide/verify-bugs-and-bisect-regressions.rst b/Documentation/admin-guide/verify-bugs-and-bisect-regressions.rst index d8946b084b1e..d83601f2a459 100644 --- a/Documentation/admin-guide/verify-bugs-and-bisect-regressions.rst +++ b/Documentation/admin-guide/verify-bugs-and-bisect-regressions.rst @@ -1757,7 +1757,7 @@ or all of these tasks: to your bootloader's configuration. You have to take care of some or all of the tasks yourself, if your -distribution lacks a installkernel script or does only handle part of them. +distribution lacks an installkernel script or only handles part of them. Consult the distribution's documentation for details.
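Where the script is present, the kernel's ``make install`` conventionally invokes it along the lines of the traditional installkernel(8) calling convention sketched below; the exact image name and paths vary by architecture and distribution, so treat this as an assumption to verify against your distribution's script::

    installkernel <kernel-version> <image-file> <System.map-file> [<install-path>]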
If in doubt, install the kernel manually:: diff --git a/Documentation/arch/arm/stm32/stm32f746-overview.rst b/Documentation/arch/arm/stm32/stm32f746-overview.rst index 78befddc7740..335f0855a858 100644 --- a/Documentation/arch/arm/stm32/stm32f746-overview.rst +++ b/Documentation/arch/arm/stm32/stm32f746-overview.rst @@ -15,7 +15,7 @@ It features: - SD/MMC/SDIO support - Ethernet controller - USB OTFG FS & HS controllers -- I2C, SPI, CAN busses support +- I2C, SPI, CAN buses support - Several 16 & 32 bits general purpose timers - Serial Audio interface - LCD controller diff --git a/Documentation/arch/arm/stm32/stm32f769-overview.rst b/Documentation/arch/arm/stm32/stm32f769-overview.rst index e482980ddf21..ef31aadee68f 100644 --- a/Documentation/arch/arm/stm32/stm32f769-overview.rst +++ b/Documentation/arch/arm/stm32/stm32f769-overview.rst @@ -15,7 +15,7 @@ It features: - SD/MMC/SDIO support*2 - Ethernet controller - USB OTFG FS & HS controllers -- I2C*4, SPI*6, CAN*3 busses support +- I2C*4, SPI*6, CAN*3 buses support - Several 16 & 32 bits general purpose timers - Serial Audio interface*2 - LCD controller diff --git a/Documentation/arch/arm/stm32/stm32h743-overview.rst b/Documentation/arch/arm/stm32/stm32h743-overview.rst index 4e15f1a42730..7659df24d362 100644 --- a/Documentation/arch/arm/stm32/stm32h743-overview.rst +++ b/Documentation/arch/arm/stm32/stm32h743-overview.rst @@ -15,7 +15,7 @@ It features: - SD/MMC/SDIO support - Ethernet controller - USB OTFG FS & HS controllers -- I2C, SPI, CAN busses support +- I2C, SPI, CAN buses support - Several 16 & 32 bits general purpose timers - Serial Audio interface - LCD controller diff --git a/Documentation/arch/arm/stm32/stm32h750-overview.rst b/Documentation/arch/arm/stm32/stm32h750-overview.rst index 0e51235c9547..be032b77d1f1 100644 --- a/Documentation/arch/arm/stm32/stm32h750-overview.rst +++ b/Documentation/arch/arm/stm32/stm32h750-overview.rst @@ -15,7 +15,7 @@ It features: - SD/MMC/SDIO support - Ethernet controller - USB OTFG FS & HS controllers -- I2C, SPI, CAN busses support +- I2C, SPI, CAN buses support - Several 16 & 32 bits general purpose timers - Serial Audio interface - LCD controller diff --git a/Documentation/arch/arm/stm32/stm32mp13-overview.rst b/Documentation/arch/arm/stm32/stm32mp13-overview.rst index 3bb9492dad49..b5e9589fb06f 100644 --- a/Documentation/arch/arm/stm32/stm32mp13-overview.rst +++ b/Documentation/arch/arm/stm32/stm32mp13-overview.rst @@ -24,7 +24,7 @@ More details: - ADC/DAC - USB EHCI/OHCI controllers - USB OTG -- I2C, SPI, CAN busses support +- I2C, SPI, CAN buses support - Several general purpose timers - Serial Audio interface - LCD controller diff --git a/Documentation/arch/arm/stm32/stm32mp151-overview.rst b/Documentation/arch/arm/stm32/stm32mp151-overview.rst index f42a2ac309c0..b58c256ede9a 100644 --- a/Documentation/arch/arm/stm32/stm32mp151-overview.rst +++ b/Documentation/arch/arm/stm32/stm32mp151-overview.rst @@ -23,7 +23,7 @@ More details: - ADC/DAC - USB EHCI/OHCI controllers - USB OTG -- I2C, SPI busses support +- I2C, SPI buses support - Several general purpose timers - Serial Audio interface - LCD-TFT controller diff --git a/Documentation/arch/loongarch/irq-chip-model.rst b/Documentation/arch/loongarch/irq-chip-model.rst index a7ecce11e445..8f5c3345109e 100644 --- a/Documentation/arch/loongarch/irq-chip-model.rst +++ b/Documentation/arch/loongarch/irq-chip-model.rst @@ -139,13 +139,13 @@ Feature EXTIOI_HAS_INT_ENCODE is part of standard EIOINTC. 
If it is 1, it indicates that CPU Interrupt Pin selection can be normal method rather than bitmap method, so interrupt can be routed to IP0 - IP15. -Feature EXTIOI_HAS_CPU_ENCODE is entension of V-EIOINTC. If it is 1, it +Feature EXTIOI_HAS_CPU_ENCODE is an extension of V-EIOINTC. If it is 1, it indicates that CPU selection can be normal method rather than bitmap method, so interrupt can be routed to CPU0 - CPU255. EXTIOI_VIRT_CONFIG ------------------ -This register is read-write register, for compatibility intterupt routed uses +This is a read-write register; for compatibility, interrupt routing uses the default method which is the same with standard EIOINTC. If the bit is set with 1, it indicated HW to use normal method rather than bitmap method. diff --git a/Documentation/arch/powerpc/eeh-pci-error-recovery.rst b/Documentation/arch/powerpc/eeh-pci-error-recovery.rst index d6643a91bdf8..153d0af055b6 100644 --- a/Documentation/arch/powerpc/eeh-pci-error-recovery.rst +++ b/Documentation/arch/powerpc/eeh-pci-error-recovery.rst @@ -315,7 +315,6 @@ network daemons and file systems that didn't need to be disturbed. ideally, the reset should happen at or below the block layer, so that the file systems are not disturbed. - Reiserfs does not tolerate errors returned from the block device. Ext3fs seems to be tolerant, retrying reads/writes until it does succeed. Both have been only lightly tested in this scenario. diff --git a/Documentation/arch/x86/cpuinfo.rst b/Documentation/arch/x86/cpuinfo.rst index dd8b7806944e..9f2e47c4b1c8 100644 --- a/Documentation/arch/x86/cpuinfo.rst +++ b/Documentation/arch/x86/cpuinfo.rst @@ -11,7 +11,7 @@ The list of feature flags in /proc/cpuinfo is not complete and represents an ill-fated attempt from long time ago to put feature flags in an easy to find place for userspace. -However, the amount of feature flags is growing by the CPU generation, +However, the number of feature flags is growing with each CPU generation, leading to unparseable and unwieldy /proc/cpuinfo. What is more, those feature flags do not even need to be in that file diff --git a/Documentation/conf.py b/Documentation/conf.py index f9828f3862f9..574896cca198 100644 --- a/Documentation/conf.py +++ b/Documentation/conf.py @@ -9,6 +9,8 @@ import os import shutil import sys +from textwrap import dedent + import sphinx # If extensions (or modules to document with autodoc) are in another directory, @@ -51,11 +53,13 @@ else: dyn_exclude_patterns.append("devicetree/bindings/**.yaml") dyn_exclude_patterns.append("core-api/kho/bindings/**.yaml") -# Properly handle include/exclude patterns -# ---------------------------------------- +# Properly handle directory patterns and LaTeX docs +# ------------------------------------------------- -def update_patterns(app, config): +def config_init(app, config): """ + Initialize path-dependent variables + On Sphinx, all directories are relative to what it is passed as SOURCEDIR parameter for sphinx-build. Due to that, all patterns that have directory names on it need to be dynamically set, after @@ -86,6 +90,38 @@ def update_patterns(app, config): config.exclude_patterns.append(rel_path) + # LaTeX and PDF output require a list of documents which are dependent + on the app.srcdir.
Add them here + + # When SPHINXDIRS is used, we just need to get index.rst, if it exists + if not os.path.samefile(doctree, app.srcdir): + doc = os.path.basename(app.srcdir) + fname = "index" + if os.path.exists(os.path.join(app.srcdir, fname + ".rst")): + latex_documents.append((fname, doc + ".tex", + "Linux %s Documentation" % doc.capitalize(), + "The kernel development community", + "manual")) + return + + # When building all docs, or when a main index.rst doesn't exist, look + for it in subdirectories + for doc in os.listdir(app.srcdir): + fname = os.path.join(doc, "index") + if not os.path.exists(os.path.join(app.srcdir, fname + ".rst")): + continue + + has = False + for l in latex_documents: + if l[0] == fname: + has = True + break + + if not has: + latex_documents.append((fname, doc + ".tex", + "Linux %s Documentation" % doc.capitalize(), + "The kernel development community", + "manual")) # helper # ------ @@ -234,7 +270,7 @@ author = "The kernel development community" # |version| and |release|, also used in various other places throughout the # built documents. # -# In a normal build, version and release are are set to KERNELVERSION and +# In a normal build, version and release are set to KERNELVERSION and # KERNELRELEASE, respectively, from the Makefile via Sphinx command line # arguments. # @@ -420,19 +456,25 @@ htmlhelp_basename = "TheLinuxKerneldoc" latex_elements = { # The paper size ('letterpaper' or 'a4paper'). "papersize": "a4paper", + "passoptionstopackages": dedent(r""" + \PassOptionsToPackage{svgnames}{xcolor} + """), # The font size ('10pt', '11pt' or '12pt'). "pointsize": "11pt", + # Needed to generate a .ind file + "printindex": r"\footnotesize\raggedright\printindex", # Latex figure (float) alignment # 'figure_align': 'htbp', # Don't mangle with UTF-8 chars + "fontenc": "", "inputenc": "", "utf8extra": "", # Set document margins - "sphinxsetup": """ + "sphinxsetup": dedent(r""" hmargin=0.5in, vmargin=1in, parsedliteralwraps=true, verbatimhintsturnover=false, - """, + """), # # Some of our authors are fond of deep nesting; tell latex to # cope. @@ -440,48 +482,22 @@ latex_elements = { "maxlistdepth": "10", # For CJK One-half spacing, need to be in front of hyperref "extrapackages": r"\usepackage{setspace}", - # Additional stuff for the LaTeX preamble. - "preamble": """ - % Use some font with UTF-8 support with XeLaTeX - \\usepackage{fontspec} - \\setsansfont{DejaVu Sans} - \\setromanfont{DejaVu Serif} - \\setmonofont{DejaVu Sans Mono} - """, -} - -# Load kerneldoc specific LaTeX settings -latex_elements["preamble"] += """ + "fontpkg": dedent(r""" + \usepackage{fontspec} + \setmainfont{DejaVu Serif} + \setsansfont{DejaVu Sans} + \setmonofont{DejaVu Sans Mono} + \newfontfamily\headingfont{DejaVu Serif} + """), + "preamble": dedent(r""" % Load kerneldoc specific LaTeX settings - \\input{kerneldoc-preamble.sty} -""" + \input{kerneldoc-preamble.sty} + """) +} -# Grouping the document tree into LaTeX files. List of tuples -# (source start file, target name, title, -# author, documentclass [howto, manual, or own class]).
-# Sorted in alphabetical order +# This will be filled in by the config-inited event latex_documents = [] -# Add all other index files from Documentation/ subdirectories -for fn in os.listdir("."): - doc = os.path.join(fn, "index") - if os.path.exists(doc + ".rst"): - has = False - for l in latex_documents: - if l[0] == doc: - has = True - break - if not has: - latex_documents.append( - ( - doc, - fn + ".tex", - "Linux %s Documentation" % fn.capitalize(), - "The kernel development community", - "manual", - ) - ) - # The name of an image file (relative to this directory) to place at the top of # the title page. # latex_logo = None @@ -577,4 +593,4 @@ loadConfig(globals()) def setup(app): """Patterns need to be updated at init time on older Sphinx versions""" - app.connect('config-inited', update_patterns) + app.connect('config-inited', config_init)
diff --git a/Documentation/core-api/dma-api.rst b/Documentation/core-api/dma-api.rst index 3087bea715ed..ca75b3541679 100644 --- a/Documentation/core-api/dma-api.rst +++ b/Documentation/core-api/dma-api.rst @@ -761,7 +761,7 @@ example warning message may look like this:: [<ffffffff80235177>] find_busiest_group+0x207/0x8a0 [<ffffffff8064784f>] _spin_lock_irqsave+0x1f/0x50 [<ffffffff803c7ea3>] check_unmap+0x203/0x490 - [<ffffffff803c8259>] debug_dma_unmap_page+0x49/0x50 + [<ffffffff803c8259>] debug_dma_unmap_phys+0x49/0x50 [<ffffffff80485f26>] nv_tx_done_optimized+0xc6/0x2c0 [<ffffffff80486c13>] nv_nic_irq_optimized+0x73/0x2b0 [<ffffffff8026df84>] handle_IRQ_event+0x34/0x70 @@ -855,7 +855,7 @@ that a driver may be leaking mappings. dma-debug interface debug_dma_mapping_error() to debug drivers that fail to check DMA mapping errors on addresses returned by dma_map_single() and dma_map_page() interfaces. This interface clears a flag set by -debug_dma_map_page() to indicate that dma_mapping_error() has been called by +debug_dma_map_phys() to indicate that dma_mapping_error() has been called by the driver. When driver does unmap, debug_dma_unmap() checks the flag and if this flag is still set, prints warning message that includes call trace that leads up to the unmap. This interface can be called from dma_mapping_error()
diff --git a/Documentation/core-api/dma-attributes.rst b/Documentation/core-api/dma-attributes.rst index 1887d92e8e92..0bdc2be65e57 100644 --- a/Documentation/core-api/dma-attributes.rst +++ b/Documentation/core-api/dma-attributes.rst @@ -130,3 +130,21 @@ accesses to DMA buffers in both privileged "supervisor" and unprivileged subsystem that the buffer is fully accessible at the elevated privilege level (and ideally inaccessible or at least read-only at the lesser-privileged levels). + +DMA_ATTR_MMIO +------------- + +This attribute indicates that the physical address is not normal system +memory. It may not be used with kmap*()/phys_to_virt()/phys_to_page() +functions, it may not be cacheable, and access using CPU load/store +instructions may not be allowed. + +Usually this will be used to describe MMIO addresses, or other non-cacheable +register addresses. When DMA mapping this sort of address, we call +the operation Peer to Peer, as one device is DMA'ing to another device. +For PCI devices, the p2pdma APIs must be used to determine if +DMA_ATTR_MMIO is appropriate. + +For architectures that require cache flushing for DMA coherence, +DMA_ATTR_MMIO will not perform any cache flushing. The address +provided must never be mapped cacheable into the CPU.
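+
+As an illustration only (not taken from an in-tree driver), the sketch below
+shows how such a mapping might look, assuming the MMIO physical address of a
+peer device is already known and that DMA_ATTR_MMIO is passed through the
+regular attrs argument of dma_map_resource(); bar_phys and the transfer size
+are hypothetical::
+
+	dma_addr_t dma;
+
+	/* bar_phys is MMIO, so no cache flushing is performed. */
+	dma = dma_map_resource(dev, bar_phys, SZ_4K, DMA_TO_DEVICE,
+			       DMA_ATTR_MMIO);
+	if (dma_mapping_error(dev, dma))
+		return -EIO;
+
+	/* ... program the peer-to-peer transfer ... */
+
+	dma_unmap_resource(dev, dma, SZ_4K, DMA_TO_DEVICE, DMA_ATTR_MMIO);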
diff --git a/Documentation/core-api/folio_queue.rst b/Documentation/core-api/folio_queue.rst index 83cfbc157e49..b7628896d2b6 100644 --- a/Documentation/core-api/folio_queue.rst +++ b/Documentation/core-api/folio_queue.rst @@ -44,7 +44,7 @@ Each segment in the list also stores: * the size of each folio and * three 1-bit marks per folio, -but hese should not be accessed directly as the underlying data structure may +but these should not be accessed directly as the underlying data structure may change, but rather the access functions outlined below should be used. The facility can be made accessible by:: diff --git a/Documentation/core-api/index.rst b/Documentation/core-api/index.rst index a03a99c2cac5..6cbdcbfa79c3 100644 --- a/Documentation/core-api/index.rst +++ b/Documentation/core-api/index.rst @@ -24,6 +24,7 @@ it. printk-index symbol-namespaces asm-annotations + real-time/index Data structures and low-level utilities ======================================= diff --git a/Documentation/core-api/irq/irq-affinity.rst b/Documentation/core-api/irq/irq-affinity.rst index 29da5000836a..9cb460cf60b6 100644 --- a/Documentation/core-api/irq/irq-affinity.rst +++ b/Documentation/core-api/irq/irq-affinity.rst @@ -9,9 +9,9 @@ ChangeLog: /proc/irq/IRQ#/smp_affinity and /proc/irq/IRQ#/smp_affinity_list specify which target CPUs are permitted for a given IRQ source. It's a bitmask -(smp_affinity) or cpu list (smp_affinity_list) of allowed CPUs. It's not +(smp_affinity) or CPU list (smp_affinity_list) of allowed CPUs. It's not allowed to turn off all CPUs, and if an IRQ controller does not support -IRQ affinity then the value will not change from the default of all cpus. +IRQ affinity then the value will not change from the default of all CPUs. /proc/irq/default_smp_affinity specifies default affinity mask that applies to all non-active IRQs. Once IRQ is allocated/activated its affinity bitmask @@ -60,7 +60,7 @@ Now lets restrict that IRQ to CPU(4-7). This time around IRQ44 was delivered only to the last four processors. i.e counters for the CPU0-3 did not change. -Here is an example of limiting that same irq (44) to cpus 1024 to 1031:: +Here is an example of limiting that same IRQ (44) to CPUs 1024 to 1031:: [root@moon 44]# echo 1024-1031 > smp_affinity_list [root@moon 44]# cat smp_affinity_list diff --git a/Documentation/core-api/irq/irq-domain.rst b/Documentation/core-api/irq/irq-domain.rst index a01c6ead1bc0..68eb2612e8a4 100644 --- a/Documentation/core-api/irq/irq-domain.rst +++ b/Documentation/core-api/irq/irq-domain.rst @@ -18,8 +18,8 @@ handlers as irqchips. I.e. in effect cascading interrupt controllers. So in the past, IRQ numbers could be chosen so that they match the hardware IRQ line into the root interrupt controller (i.e. the component actually firing the interrupt line to the CPU). Nowadays, -this number is just a number and the number loose all kind of -correspondence to hardware interrupt numbers. +this number is just a number and the number has no +relationship to hardware interrupt numbers. For this reason, we need a mechanism to separate controller-local interrupt numbers, called hardware IRQs, from Linux IRQ numbers. @@ -77,15 +77,15 @@ Once a mapping has been established, it can be retrieved or used via a variety of methods: - irq_resolve_mapping() returns a pointer to the irq_desc structure - for a given domain and hwirq number, and NULL if there was no + for a given domain and hwirq number, or NULL if there was no mapping. 
- irq_find_mapping() returns a Linux IRQ number for a given domain and - hwirq number, and 0 if there was no mapping + hwirq number, or 0 if there was no mapping - generic_handle_domain_irq() handles an interrupt described by a domain and a hwirq number -Note that irq domain lookups must happen in contexts that are -compatible with a RCU read-side critical section. +Note that irq_domain lookups must happen in contexts that are +compatible with an RCU read-side critical section. The irq_create_mapping() function must be called *at least once* before any call to irq_find_mapping(), lest the descriptor will not @@ -100,7 +100,7 @@ Types of irq_domain Mappings ============================ There are several mechanisms available for reverse mapping from hwirq -to Linux irq, and each mechanism uses a different allocation function. +to Linux IRQ, and each mechanism uses a different allocation function. Which reverse map type should be used depends on the use case. Each of the reverse map types are described below: @@ -111,13 +111,13 @@ Linear irq_domain_create_linear() -The linear reverse map maintains a fixed size table indexed by the +The linear reverse map maintains a fixed-size table indexed by the hwirq number. When a hwirq is mapped, an irq_desc is allocated for the hwirq, and the IRQ number is stored in the table. The Linear map is a good choice when the maximum number of hwirqs is fixed and a relatively small number (~ < 256). The advantages of this -map are fixed time lookup for IRQ numbers, and irq_descs are only +map are fixed-time lookup for IRQ numbers, and irq_descs are only allocated for in-use IRQs. The disadvantage is that the table must be as large as the largest possible hwirq number. @@ -134,7 +134,7 @@ The irq_domain maintains a radix tree map from hwirq numbers to Linux IRQs. When an hwirq is mapped, an irq_desc is allocated and the hwirq is used as the lookup key for the radix tree. -The tree map is a good choice if the hwirq number can be very large +The Tree map is a good choice if the hwirq number can be very large since it doesn't need to allocate a table as large as the largest hwirq number. The disadvantage is that hwirq to IRQ number lookup is dependent on how many entries are in the table. @@ -169,10 +169,10 @@ Legacy The Legacy mapping is a special case for drivers that already have a range of irq_descs allocated for the hwirqs. It is used when the -driver cannot be immediately converted to use the linear mapping. For +driver cannot be immediately converted to use the Linear mapping. For example, many embedded system board support files use a set of #defines for IRQ numbers that are passed to struct device registrations. In that -case the Linux IRQ numbers cannot be dynamically assigned and the legacy +case the Linux IRQ numbers cannot be dynamically assigned and the Legacy mapping should be used. As the name implies, the \*_legacy() functions are deprecated and only @@ -180,15 +180,15 @@ exist to ease the support of ancient platforms. No new users should be added. Same goes for the \*_simple() functions when their use results in the legacy behaviour. -The legacy map assumes a contiguous range of IRQ numbers has already +The Legacy map assumes a contiguous range of IRQ numbers has already been allocated for the controller and that the IRQ number can be calculated by adding a fixed offset to the hwirq number, and visa-versa. 
The disadvantage is that it requires the interrupt controller to manage IRQ allocations and it requires an irq_desc to be allocated for every hwirq, even if it is unused. -The legacy map should only be used if fixed IRQ mappings must be -supported. For example, ISA controllers would use the legacy map for +The Legacy map should only be used if fixed IRQ mappings must be +supported. For example, ISA controllers would use the Legacy map for mapping Linux IRQs 0-15 so that existing ISA drivers get the correct IRQ numbers. @@ -197,7 +197,7 @@ which will use a legacy domain only if an IRQ range is supplied by the system and will otherwise use a linear domain mapping. The semantics of this call are such that if an IRQ range is specified then descriptors will be allocated on-the-fly for it, and if no range is specified it -will fall through to irq_domain_create_linear() which means *no* irq +will fall through to irq_domain_create_linear() which means *no* IRQ descriptors will be allocated. A typical use case for simple domains is where an irqchip provider @@ -214,7 +214,7 @@ Hierarchy IRQ Domain On some architectures, there may be multiple interrupt controllers involved in delivering an interrupt from the device to the target CPU. -Let's look at a typical interrupt delivering path on x86 platforms:: +Let's look at a typical interrupt delivery path on x86 platforms:: Device --> IOAPIC -> Interrupt remapping Controller -> Local APIC -> CPU @@ -227,8 +227,8 @@ There are three interrupt controllers involved: To support such a hardware topology and make software architecture match hardware architecture, an irq_domain data structure is built for each interrupt controller and those irq_domains are organized into hierarchy. -When building irq_domain hierarchy, the irq_domain near to the device is -child and the irq_domain near to CPU is parent. So a hierarchy structure +When building irq_domain hierarchy, the irq_domain nearest the device is +child and the irq_domain nearest the CPU is parent. So a hierarchy structure as below will be built for the example above:: CPU Vector irq_domain (root irq_domain to manage CPU vectors)
diff --git a/Documentation/core-api/printk-formats.rst b/Documentation/core-api/printk-formats.rst index 4b7f3646ec6c..7f2f11b48286 100644 --- a/Documentation/core-api/printk-formats.rst +++ b/Documentation/core-api/printk-formats.rst @@ -521,7 +521,7 @@ Fwnode handles %pfw[fP] -For printing information on fwnode handles. The default is to print the full +For printing information on an fwnode_handle. The default is to print the full node name, including the path. The modifiers are functionally equivalent to %pOF above.
diff --git a/Documentation/core-api/real-time/architecture-porting.rst b/Documentation/core-api/real-time/architecture-porting.rst new file mode 100644 index 000000000000..d822fac29922 --- /dev/null +++ b/Documentation/core-api/real-time/architecture-porting.rst @@ -0,0 +1,109 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============================================= +Porting an architecture to support PREEMPT_RT +============================================= + +:Author: Sebastian Andrzej Siewior <bigeasy@linutronix.de> + +This list outlines the architecture-specific requirements that must be +implemented in order to enable PREEMPT_RT. Once all required features are +implemented, ARCH_SUPPORTS_RT can be selected in the architecture’s Kconfig to make +PREEMPT_RT selectable. +Many prerequisites (genirq support, for example) are enforced by the common code +and are omitted here.
+ +The optional features are not strictly required, but they are worth +considering. + +Requirements +------------ + +Forced threaded interrupts + CONFIG_IRQ_FORCED_THREADING must be selected. Any interrupts that must + remain in hard-IRQ context must be marked with IRQF_NO_THREAD. This + requirement applies, for instance, to clocksource event interrupts, + perf interrupts, and cascading interrupt-controller handlers. + +PREEMPTION support + Kernel preemption must be supported and requires that + CONFIG_ARCH_NO_PREEMPT remain unselected. Scheduling requests, such as those + issued from an interrupt or other exception handler, must be processed + immediately. + +POSIX CPU timers and KVM + POSIX CPU timers must expire from thread context rather than directly within + the timer interrupt. This behavior is enabled by setting the configuration + option CONFIG_HAVE_POSIX_CPU_TIMERS_TASK_WORK. + When KVM is enabled, CONFIG_KVM_XFER_TO_GUEST_WORK must also be set to ensure + that any pending work, such as POSIX timer expiration, is handled before + transitioning into guest mode. + +Hard-IRQ and Soft-IRQ stacks + Soft interrupts are handled in the thread context in which they are raised. If + a soft interrupt is triggered from hard-IRQ context, its execution is deferred + to the ksoftirqd thread. Preemption is never disabled during soft interrupt + handling, which makes soft interrupts preemptible. + If an architecture provides a custom __do_softirq() implementation that uses a + separate stack, it must select CONFIG_HAVE_SOFTIRQ_ON_OWN_STACK. The + functionality should only be enabled when CONFIG_SOFTIRQ_ON_OWN_STACK is set. + +FPU and SIMD access in kernel mode + FPU and SIMD registers are typically not used in kernel mode and are therefore + not saved during kernel preemption. As a result, any kernel code that uses + these registers must be enclosed within a kernel_fpu_begin() and + kernel_fpu_end() section. + The kernel_fpu_begin() function usually invokes local_bh_disable() to prevent + interruptions from softirqs and to disable regular preemption. This allows the + protected code to run safely in both thread and softirq contexts. + On PREEMPT_RT kernels, however, kernel_fpu_begin() must not call + local_bh_disable(). Instead, it should use preempt_disable(), since softirqs + are always handled in thread context under PREEMPT_RT. In this case, disabling + preemption alone is sufficient. + The crypto subsystem operates on memory pages and requires users to "walk and + map" these pages while processing a request. This operation must occur outside + the kernel_fpu_begin()/kernel_fpu_end() section because it requires preemption + to be enabled. These preemption points are generally sufficient to avoid + excessive scheduling latency. A minimal usage sketch follows this list. + +Exception handlers + Exception handlers, such as the page fault handler, typically enable interrupts + early, before invoking any generic code to process the exception. This is + necessary because handling a page fault may involve operations that can sleep. + Enabling interrupts is especially important on PREEMPT_RT, where certain + locks, such as spinlock_t, become sleepable. For example, handling an + invalid opcode may result in sending a SIGILL signal to the user task. A + debug exception will send a SIGTRAP signal. + In both cases, if the exception occurred in user space, it is safe to enable + interrupts early. Sending a signal requires both interrupts and kernel + preemption to be enabled.
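+
+As a minimal sketch of the FPU/SIMD requirement above (assuming the x86
+<asm/fpu/api.h> interface; the function name and buffer handling are
+illustrative, not taken from an in-tree user)::
+
+	#include <asm/fpu/api.h>
+
+	static void xor_block_simd(u8 *dst, const u8 *src, size_t len)
+	{
+		/*
+		 * SIMD registers may only be touched between
+		 * kernel_fpu_begin() and kernel_fpu_end(). On PREEMPT_RT
+		 * this section disables preemption rather than bottom
+		 * halves, so it must stay short.
+		 */
+		kernel_fpu_begin();
+		/* ... SIMD implementation operating on dst and src ... */
+		kernel_fpu_end();
+	}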
+ +Optional features +----------------- + +Timer and clocksource + A high-resolution clocksource and clockevents device are recommended. The + clockevents device should support the CLOCK_EVT_FEAT_ONESHOT feature for + optimal timer behavior. In most cases, microsecond-level accuracy is + sufficient. + +Lazy preemption + This mechanism allows an in-kernel scheduling request for non-real-time tasks + to be delayed until the task is about to return to user space. It helps avoid + preempting a task that holds a sleeping lock at the time of the scheduling + request. + With CONFIG_GENERIC_IRQ_ENTRY enabled, supporting this feature requires + defining a bit for TIF_NEED_RESCHED_LAZY, preferably near TIF_NEED_RESCHED. + +Serial console with NBCON + With PREEMPT_RT enabled, all console output is handled by a dedicated thread + rather than directly from the context in which printk() is invoked. This design + allows printk() to be safely used in atomic contexts. + However, this also means that if the kernel crashes and cannot switch to the + printing thread, no output will be visible, preventing the system from printing + its final messages. + There are exceptions for immediate output, such as during panic() handling. To + support this, the console driver must implement new-style lock handling. This + involves setting the CON_NBCON flag in console::flags and providing + implementations for the write_atomic, write_thread, device_lock, and + device_unlock callbacks.
diff --git a/Documentation/core-api/real-time/differences.rst b/Documentation/core-api/real-time/differences.rst new file mode 100644 index 000000000000..83ec9aa1c61a --- /dev/null +++ b/Documentation/core-api/real-time/differences.rst @@ -0,0 +1,242 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=========================== +How realtime kernels differ +=========================== + +:Author: Sebastian Andrzej Siewior <bigeasy@linutronix.de> + +Preface +======= + +With forced-threaded interrupts and sleeping spin locks, code paths that +previously caused long scheduling latencies have been made preemptible and +moved into process context. This allows the scheduler to manage them more +effectively and respond to higher-priority tasks with reduced latency. + +The following chapters provide an overview of key differences between a +PREEMPT_RT kernel and a standard, non-PREEMPT_RT kernel. + +Locking +======= + +Spinning locks such as spinlock_t are used to provide synchronization for data +structures accessed from both interrupt context and process context. For this +reason, locking functions are also available with the _irq() or _irqsave() +suffixes, which disable interrupts before acquiring the lock. This ensures that +the lock can be safely acquired in process context when interrupts are enabled. + +However, on a PREEMPT_RT system, interrupts are forced-threaded and no longer +run in hard IRQ context. As a result, there is no need to disable interrupts as +part of the locking procedure when using spinlock_t. + +For low-level core components such as interrupt handling, the scheduler, or the +timer subsystem, the kernel uses raw_spinlock_t. This lock type preserves +traditional semantics: it disables preemption and, when used with _irq() or +_irqsave(), also disables interrupts. This ensures proper synchronization in +critical sections that must remain non-preemptible or run with interrupts disabled. + +Execution context +================= + +Interrupt handling in a PREEMPT_RT system is invoked in process context through +the use of threaded interrupts.
Other parts of the kernel also shift their +execution into threaded context by different mechanisms. The goal is to keep +execution paths preemptible, allowing the scheduler to interrupt them when a +higher-priority task needs to run. + +Below is an overview of the kernel subsystems involved in this transition to +threaded, preemptible execution. + +Interrupt handling +------------------ + +All interrupts are forced-threaded in a PREEMPT_RT system. The exceptions are +interrupts that are requested with the IRQF_NO_THREAD, IRQF_PERCPU, or +IRQF_ONESHOT flags. + +The IRQF_ONESHOT flag is used together with threaded interrupts, meaning those +registered using request_threaded_irq() and providing only a threaded handler. +Its purpose is to keep the interrupt line masked until the threaded handler has +completed. + +If a primary handler is also provided in this case, it is essential that the +handler does not acquire any sleeping locks, as it will not be threaded. The +handler should be minimal and must avoid introducing delays, such as +busy-waiting on hardware registers. + + +Soft interrupts, bottom half handling +------------------------------------- + +Soft interrupts are raised by the interrupt handler and are executed after the +handler returns. Since they run in thread context, they can be preempted by +other threads. Do not assume that softirq context runs with preemption +disabled. This means you must not rely on mechanisms like local_bh_disable() in +process context to protect per-CPU variables. Because softirq handlers are +preemptible under PREEMPT_RT, this approach does not provide reliable +synchronization. + +If this kind of protection is required for performance reasons, consider using +local_lock_nested_bh(). On non-PREEMPT_RT kernels, this allows lockdep to +verify that bottom halves are disabled. On PREEMPT_RT systems, it adds the +necessary locking to ensure proper protection. + +Using local_lock_nested_bh() also makes the locking scope explicit and easier +for readers and maintainers to understand. + + +per-CPU variables +----------------- + +Protecting access to per-CPU variables solely by using preempt_disable() should +be avoided, especially if the critical section has unbounded runtime or may +call APIs that can sleep. + +If using a spinlock_t is considered too costly for performance reasons, +consider using local_lock_t. On non-PREEMPT_RT configurations, this introduces +no runtime overhead when lockdep is disabled. With lockdep enabled, it verifies +that the lock is only acquired in process context and never from softirq or +hard IRQ context. + +On a PREEMPT_RT kernel, local_lock_t is implemented using a per-CPU spinlock_t, +which provides safe local protection for per-CPU data while keeping the system +preemptible. + +Because spinlock_t on PREEMPT_RT does not disable preemption, it cannot be used +to protect per-CPU data by relying on implicit preemption disabling. If this +inherited preemption disabling is essential and if local_lock_t cannot be used +due to performance constraints, brevity of the code, or abstraction boundaries +within an API, then preempt_disable_nested() may be a suitable alternative. On +non-PREEMPT_RT kernels, it verifies with lockdep that preemption is already +disabled. On PREEMPT_RT, it explicitly disables preemption. + +Timers +------ + +By default, an hrtimer is executed in hard interrupt context. The exception is +timers initialized with the HRTIMER_MODE_SOFT flag, which are executed in +softirq context.
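+
+For illustration, a hedged sketch of the two initializations described above,
+assuming the hrtimer_setup() helper of recent kernels; the timer variables and
+callback are hypothetical::
+
+	static enum hrtimer_restart my_timer_cb(struct hrtimer *t)
+	{
+		return HRTIMER_NORESTART;
+	}
+
+	/* Expires in hard interrupt context by default. */
+	hrtimer_setup(&t1, my_timer_cb, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+
+	/* Explicitly requests softirq expiry. */
+	hrtimer_setup(&t2, my_timer_cb, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT);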
+ +On a PREEMPT_RT kernel, this behavior is reversed: hrtimers are executed in +softirq context by default, typically within the ktimersd thread. This thread +runs at the lowest real-time priority, ensuring it executes before any +SCHED_OTHER tasks but does not interfere with higher-priority real-time +threads. To explicitly request execution in hard interrupt context on +PREEMPT_RT, the timer must be marked with the HRTIMER_MODE_HARD flag. + +Memory allocation +----------------- + +The memory allocation APIs, such as kmalloc() and alloc_pages(), require a +gfp_t flag to indicate the allocation context. On non-PREEMPT_RT kernels, it is +necessary to use GFP_ATOMIC when allocating memory from interrupt context or +from sections where preemption is disabled. This is because the allocator must +not sleep in these contexts waiting for memory to become available. + +However, this approach does not work on PREEMPT_RT kernels. The memory +allocator in PREEMPT_RT uses sleeping locks internally, which cannot be +acquired when preemption is disabled. Fortunately, this is generally not a +problem, because PREEMPT_RT moves most contexts that would traditionally run +with preemption or interrupts disabled into threaded context, where sleeping is +allowed. + +What remains problematic is code that explicitly disables preemption or +interrupts. In such cases, memory allocation must be performed outside the +critical section. + +This restriction also applies to memory deallocation routines such as kfree() +and free_pages(), which may also involve internal locking and must not be +called from non-preemptible contexts. + +IRQ work +-------- + +The irq_work API provides a mechanism to schedule a callback in interrupt +context. It is designed for use in contexts where traditional scheduling is not +possible, such as from within NMI handlers or from inside the scheduler, where +using a workqueue would be unsafe. + +On non-PREEMPT_RT systems, all irq_work items are executed immediately in +interrupt context. Items marked with IRQ_WORK_LAZY are deferred until the next +timer tick but are still executed in interrupt context. + +On PREEMPT_RT systems, the execution model changes. Because irq_work callbacks +may acquire sleeping locks or have unbounded execution time, they are handled +in thread context by a per-CPU irq_work kernel thread. This thread runs at the +lowest real-time priority, ensuring it executes before any SCHED_OTHER tasks +but does not interfere with higher-priority real-time threads. + +The exceptions are work items marked with IRQ_WORK_HARD_IRQ, which are still +executed in hard interrupt context. Lazy items (IRQ_WORK_LAZY) continue to be +deferred until the next timer tick and are also executed by the per-CPU irq_work +thread. + +RCU callbacks +------------- + +RCU callbacks are invoked by default in softirq context. Their execution is +important because, depending on the use case, they either free memory or ensure +progress in state transitions. Running these callbacks as part of the softirq +chain can lead to undesired situations, such as contention for CPU resources +with other SCHED_OTHER tasks when executed within ksoftirqd. + +To avoid running callbacks in softirq context, the RCU subsystem provides a +mechanism to execute them in process context instead. This behavior can be +enabled by setting the boot command-line parameter rcutree.use_softirq=0. This +setting is enforced in kernels configured with PREEMPT_RT.
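+
+A conventional RCU callback, shown only to make the above concrete; the
+structure and function names are illustrative::
+
+	struct myobj {
+		struct rcu_head rcu;
+		/* ... payload ... */
+	};
+
+	static void myobj_free_rcu(struct rcu_head *head)
+	{
+		struct myobj *obj = container_of(head, struct myobj, rcu);
+
+		/*
+		 * Invoked in softirq context by default, or in process
+		 * context when rcutree.use_softirq=0 (enforced on
+		 * PREEMPT_RT).
+		 */
+		kfree(obj);
+	}
+
+	/* After the grace period ends, myobj_free_rcu() frees the object. */
+	call_rcu(&obj->rcu, myobj_free_rcu);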
+ +Spin until ready +================ + +The "spin until ready" pattern involves repeatedly checking (spinning on) the +state of a data structure until it becomes available. This pattern assumes that +preemption, soft interrupts, or interrupts are disabled. If the data structure +is marked busy, it is presumed to be in use by another CPU, and spinning should +eventually succeed as that CPU makes progress. + +Some examples are hrtimer_cancel() or timer_delete_sync(). These functions +cancel timers that execute with interrupts or soft interrupts disabled. If a +thread attempts to cancel a timer and finds it active, spinning until the +callback completes is safe because the callback can only run on another CPU and +will eventually finish. + +On PREEMPT_RT kernels, however, timer callbacks run in thread context. This +introduces a challenge: a higher-priority thread attempting to cancel the timer +may preempt the timer callback thread. Since the scheduler cannot migrate the +callback thread to another CPU due to affinity constraints, spinning can result +in livelock even on multiprocessor systems. + +To avoid this, both the canceling and callback sides must use a handshake +mechanism that supports priority inheritance. This allows the canceling thread +to suspend until the callback completes, ensuring forward progress without +risking livelock. + +In order to solve the problem at the API level, the sequence locks were extended +to allow a proper handover between the spinning reader and the possibly +blocked writer. + +Sequence locks +-------------- + +Sequence counters and sequential locks are documented in +Documentation/locking/seqlock.rst. + +The interface has been extended to ensure proper preemption states for the +writer and spinning reader contexts. This is achieved by embedding the writer +serialization lock directly into the sequence counter type, resulting in +composite types such as seqcount_spinlock_t or seqcount_mutex_t. + +These composite types allow readers to detect an ongoing write and actively +boost the writer’s priority to help it complete its update instead of spinning +and waiting for its completion. + +If the plain seqcount_t is used, extra care must be taken to synchronize the +reader with the writer during updates. The writer must ensure its update is +serialized and non-preemptible relative to the reader. This cannot be achieved +using a regular spinlock_t because spinlock_t on PREEMPT_RT does not disable +preemption. In such cases, using seqcount_spinlock_t is the preferred solution. + +However, if there is no spinning involved, i.e., if the reader only needs to +detect whether a write has started and not serialize against it, then using +seqcount_t is reasonable.
diff --git a/Documentation/core-api/real-time/index.rst b/Documentation/core-api/real-time/index.rst new file mode 100644 index 000000000000..7e14c4ea3d59 --- /dev/null +++ b/Documentation/core-api/real-time/index.rst @@ -0,0 +1,16 @@ +.. SPDX-License-Identifier: GPL-2.0 + +===================== +Real-time preemption +===================== + +This documentation is intended for Linux kernel developers and contributors +interested in the inner workings of PREEMPT_RT. It explains key concepts and +the required changes compared to a non-PREEMPT_RT configuration. + +..
toctree:: + :maxdepth: 2 + + theory + differences + architecture-porting
diff --git a/Documentation/core-api/real-time/theory.rst b/Documentation/core-api/real-time/theory.rst new file mode 100644 index 000000000000..43d0120737f8 --- /dev/null +++ b/Documentation/core-api/real-time/theory.rst @@ -0,0 +1,116 @@ +.. SPDX-License-Identifier: GPL-2.0 + +===================== +Theory of operation +===================== + +:Author: Sebastian Andrzej Siewior <bigeasy@linutronix.de> + +Preface +======= + +PREEMPT_RT transforms the Linux kernel into a real-time kernel. It achieves +this by replacing locking primitives, such as spinlock_t, with a preemptible +and priority-inheritance-aware implementation known as rtmutex, and by enforcing +the use of threaded interrupts. As a result, the kernel becomes fully +preemptible, with the exception of a few critical code paths, including entry +code, the scheduler, and low-level interrupt handling routines. + +This transformation places the majority of kernel execution contexts under the +control of the scheduler and significantly increases the number of preemption +points. Consequently, it reduces the latency between a high-priority task +becoming runnable and its actual execution on the CPU. + +Scheduling +========== + +The core principles of Linux scheduling and the associated user-space API are +documented in the +`sched(7) <https://man7.org/linux/man-pages/man7/sched.7.html>`_ man page. +By default, the Linux kernel uses the SCHED_OTHER scheduling policy. Under +this policy, a task is preempted when the scheduler determines that it has +consumed a fair share of CPU time relative to other runnable tasks. However, +the policy does not guarantee immediate preemption when a new SCHED_OTHER task +becomes runnable. The currently running task may continue executing. + +This behavior differs from that of real-time scheduling policies such as +SCHED_FIFO. When a task with a real-time policy becomes runnable, the +scheduler immediately selects it for execution if it has a higher priority than +the currently running task. The task continues to run until it voluntarily +yields the CPU, typically by blocking on an event. + +Sleeping spin locks +=================== + +The various lock types and their behavior under real-time configurations are +described in detail in Documentation/locking/locktypes.rst. +In a non-PREEMPT_RT configuration, a spinlock_t is acquired by first disabling +preemption and then actively spinning until the lock becomes available. Once +the lock is released, preemption is enabled. From a real-time perspective, +this approach is undesirable because disabling preemption prevents the +scheduler from switching to a higher-priority task, potentially increasing +latency. + +To address this, PREEMPT_RT replaces spinning locks with sleeping spin locks +that do not disable preemption. On PREEMPT_RT, spinlock_t is implemented using +rtmutex. Instead of spinning, a task attempting to acquire a contended lock +disables CPU migration, donates its priority to the lock owner (priority +inheritance), and voluntarily schedules out while waiting for the lock to +become available. + +Disabling CPU migration provides the same effect as disabling preemption, while +still allowing preemption and ensuring that the task continues to run on the +same CPU while holding a sleeping lock.
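+
+The key point is that driver and subsystem source code does not change; only
+the lock implementation underneath does. A sketch with illustrative names::
+
+	static DEFINE_SPINLOCK(stats_lock);
+	static unsigned long nr_events;
+
+	void count_event(void)
+	{
+		/*
+		 * Without PREEMPT_RT: spins with preemption disabled.
+		 * With PREEMPT_RT: rtmutex-based and may sleep; only CPU
+		 * migration is disabled while the lock is held.
+		 */
+		spin_lock(&stats_lock);
+		nr_events++;
+		spin_unlock(&stats_lock);
+	}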
+ +Priority inheritance +==================== + +Lock types such as spinlock_t and mutex_t in a PREEMPT_RT enabled kernel are +implemented on top of rtmutex, which provides support for priority inheritance +(PI). When a task blocks on such a lock, the PI mechanism temporarily +propagates the blocked task’s scheduling parameters to the lock owner. + +For example, if a SCHED_FIFO task A blocks on a lock currently held by a +SCHED_OTHER task B, task A’s scheduling policy and priority are temporarily +inherited by task B. After this inheritance, task A is put to sleep while +waiting for the lock, and task B effectively becomes the highest-priority task +in the system. This allows B to continue executing, make progress, and +eventually release the lock. + +Once B releases the lock, it reverts to its original scheduling parameters, and +task A can resume execution. + +Threaded interrupts +=================== + +Interrupt handlers are another source of code that executes with preemption +disabled and outside the control of the scheduler. To bring interrupt handling +under scheduler control, PREEMPT_RT enforces threaded interrupt handlers. + +With forced threading, interrupt handling is split into two stages. The first +stage, the primary handler, is executed in IRQ context with interrupts disabled. +Its sole responsibility is to wake the associated threaded handler. The second +stage, the threaded handler, is the function passed to request_irq() as the +interrupt handler. It runs in process context, scheduled by the kernel. + +From waking the interrupt thread until threaded handling is completed, the +interrupt source is masked in the interrupt controller. This ensures that the +device interrupt remains pending but does not retrigger the CPU, allowing the +system to exit IRQ context and handle the interrupt in a scheduled thread. + +By default, the threaded handler executes with the SCHED_FIFO scheduling policy +and a priority of 50 (MAX_RT_PRIO / 2), which is midway between the minimum and +maximum real-time priorities. + +If the threaded interrupt handler raises any soft interrupts during its +execution, those soft interrupt routines are invoked after the threaded handler +completes, within the same thread. Preemption remains enabled during the +execution of the soft interrupt handler. + +Summary +======= + +By using sleeping locks and forced-threaded interrupts, PREEMPT_RT +significantly reduces sections of code where interrupts or preemption is +disabled, allowing the scheduler to preempt the current execution context and +switch to a higher-priority task. 
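+
+To relate the threaded-interrupt model described above to driver code: one
+common way a driver ends up with a threaded handler is an explicit
+request_threaded_irq() registration, sketched below with illustrative names.
+With a NULL primary handler, the core supplies one that merely wakes the
+interrupt thread, and IRQF_ONESHOT keeps the line masked until the thread
+function returns::
+
+	static irqreturn_t mydev_irq_thread(int irq, void *data)
+	{
+		/* Process context: may sleep and may take spinlock_t. */
+		return IRQ_HANDLED;
+	}
+
+	ret = request_threaded_irq(irq, NULL, mydev_irq_thread,
+				   IRQF_ONESHOT, "mydev", dev);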
diff --git a/Documentation/dev-tools/autofdo.rst b/Documentation/dev-tools/autofdo.rst index 1f0a451e9ccd..bcf06e7d6ffa 100644 --- a/Documentation/dev-tools/autofdo.rst +++ b/Documentation/dev-tools/autofdo.rst @@ -131,11 +131,11 @@ Here is an example workflow for AutoFDO kernel: For Zen3:: - $ cat proc/cpuinfo | grep " brs" + $ cat /proc/cpuinfo | grep " brs" For Zen4:: - $ cat proc/cpuinfo | grep amd_lbr_v2 + $ cat /proc/cpuinfo | grep amd_lbr_v2 The following command generated the perf data file:: diff --git a/Documentation/dev-tools/index.rst b/Documentation/dev-tools/index.rst index 65c54b27a60b..4b8425e348ab 100644 --- a/Documentation/dev-tools/index.rst +++ b/Documentation/dev-tools/index.rst @@ -29,6 +29,7 @@ Documentation/process/debugging/index.rst ubsan kmemleak kcsan + lkmm/index kfence kselftest kunit/index diff --git a/Documentation/dev-tools/ktap.rst b/Documentation/dev-tools/ktap.rst index 414c105b10a9..a9810bed5fd4 100644 --- a/Documentation/dev-tools/ktap.rst +++ b/Documentation/dev-tools/ktap.rst @@ -5,7 +5,7 @@ The Kernel Test Anything Protocol (KTAP), version 1 =================================================== TAP, or the Test Anything Protocol is a format for specifying test results used -by a number of projects. It's website and specification are found at this `link +by a number of projects. Its website and specification are found at this `link <https://testanything.org/>`_. The Linux Kernel largely uses TAP output for test results. However, Kernel testing frameworks have special needs for test results which don't align with the original TAP specification. Thus, a "Kernel TAP" @@ -20,6 +20,7 @@ machine-readable, whereas the diagnostic data is unstructured and is there to aid human debugging. KTAP output is built from four different types of lines: + - Version lines - Plan lines - Test case result lines @@ -38,6 +39,7 @@ All KTAP-formatted results begin with a "version line" which specifies which version of the (K)TAP standard the result is compliant with. For example: + - "KTAP version 1" - "TAP version 13" - "TAP version 14" @@ -276,6 +278,7 @@ Example KTAP output This output defines the following hierarchy: A single test called "main_test", which fails, and has three subtests: + - "example_test_1", which passes, and has one subtest: - "test_1", which passes, and outputs the diagnostic message "test_1: initializing test_1" diff --git a/Documentation/dev-tools/lkmm/docs/access-marking.rst b/Documentation/dev-tools/lkmm/docs/access-marking.rst new file mode 100644 index 000000000000..80058a4da980 --- /dev/null +++ b/Documentation/dev-tools/lkmm/docs/access-marking.rst @@ -0,0 +1,11 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Access Marking +-------------- + +Literal include of ``tools/memory-model/Documentation/access-marking.txt``. + +------------------------------------------------------------------ + +.. kernel-include:: tools/memory-model/Documentation/access-marking.txt + :literal: diff --git a/Documentation/dev-tools/lkmm/docs/cheatsheet.rst b/Documentation/dev-tools/lkmm/docs/cheatsheet.rst new file mode 100644 index 000000000000..37681f6a6a8c --- /dev/null +++ b/Documentation/dev-tools/lkmm/docs/cheatsheet.rst @@ -0,0 +1,11 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Cheatsheet +---------- + +Literal include of ``tools/memory-model/Documentation/cheatsheet.txt``. + +------------------------------------------------------------------ + +.. 
kernel-include:: tools/memory-model/Documentation/cheatsheet.txt + :literal: diff --git a/Documentation/dev-tools/lkmm/docs/control-dependencies.rst b/Documentation/dev-tools/lkmm/docs/control-dependencies.rst new file mode 100644 index 000000000000..5ae97e8861eb --- /dev/null +++ b/Documentation/dev-tools/lkmm/docs/control-dependencies.rst @@ -0,0 +1,11 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Control Dependencies +-------------------- + +Literal include of ``tools/memory-model/Documentation/control-dependencies.txt``. + +------------------------------------------------------------------ + +.. kernel-include:: tools/memory-model/Documentation/control-dependencies.txt + :literal: diff --git a/Documentation/dev-tools/lkmm/docs/explanation.rst b/Documentation/dev-tools/lkmm/docs/explanation.rst new file mode 100644 index 000000000000..0bcba9a5ddf7 --- /dev/null +++ b/Documentation/dev-tools/lkmm/docs/explanation.rst @@ -0,0 +1,11 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Explanation +----------- + +Literal include of ``tools/memory-model/Documentation/explanation.txt``. + +------------------------------------------------------------------ + +.. kernel-include:: tools/memory-model/Documentation/explanation.txt + :literal: diff --git a/Documentation/dev-tools/lkmm/docs/glossary.rst b/Documentation/dev-tools/lkmm/docs/glossary.rst new file mode 100644 index 000000000000..849aefdf3d6e --- /dev/null +++ b/Documentation/dev-tools/lkmm/docs/glossary.rst @@ -0,0 +1,11 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Glossary +-------- + +Literal include of ``tools/memory-model/Documentation/glossary.txt``. + +------------------------------------------------------------------ + +.. kernel-include:: tools/memory-model/Documentation/glossary.txt + :literal: diff --git a/Documentation/dev-tools/lkmm/docs/herd-representation.rst b/Documentation/dev-tools/lkmm/docs/herd-representation.rst new file mode 100644 index 000000000000..f7b41f286eb9 --- /dev/null +++ b/Documentation/dev-tools/lkmm/docs/herd-representation.rst @@ -0,0 +1,11 @@ +.. SPDX-License-Identifier: GPL-2.0 + +herd-representation +------------------- + +Literal include of ``tools/memory-model/Documentation/herd-representation.txt``. + +------------------------------------------------------------------ + +.. kernel-include:: tools/memory-model/Documentation/herd-representation.txt + :literal: diff --git a/Documentation/dev-tools/lkmm/docs/index.rst b/Documentation/dev-tools/lkmm/docs/index.rst new file mode 100644 index 000000000000..abbddcc009de --- /dev/null +++ b/Documentation/dev-tools/lkmm/docs/index.rst @@ -0,0 +1,21 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Documentation +============= + +.. toctree:: + :maxdepth: 1 + + readme + simple + ordering + litmus-tests + locking + recipes + control-dependencies + access-marking + cheatsheet + explanation + herd-representation + glossary + references diff --git a/Documentation/dev-tools/lkmm/docs/litmus-tests.rst b/Documentation/dev-tools/lkmm/docs/litmus-tests.rst new file mode 100644 index 000000000000..3293f4540156 --- /dev/null +++ b/Documentation/dev-tools/lkmm/docs/litmus-tests.rst @@ -0,0 +1,11 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Litmus Tests +------------ + +Literal include of ``tools/memory-model/Documentation/litmus-tests.txt``. + +------------------------------------------------------------------ + +.. 
kernel-include:: tools/memory-model/Documentation/litmus-tests.txt + :literal: diff --git a/Documentation/dev-tools/lkmm/docs/locking.rst b/Documentation/dev-tools/lkmm/docs/locking.rst new file mode 100644 index 000000000000..b5eae4c0acb7 --- /dev/null +++ b/Documentation/dev-tools/lkmm/docs/locking.rst @@ -0,0 +1,11 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Locking +------- + +Literal include of ``tools/memory-model/Documentation/locking.txt``. + +------------------------------------------------------------------ + +.. kernel-include:: tools/memory-model/Documentation/locking.txt + :literal: diff --git a/Documentation/dev-tools/lkmm/docs/ordering.rst b/Documentation/dev-tools/lkmm/docs/ordering.rst new file mode 100644 index 000000000000..a2343c12462d --- /dev/null +++ b/Documentation/dev-tools/lkmm/docs/ordering.rst @@ -0,0 +1,11 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Ordering +-------- + +Literal include of ``tools/memory-model/Documentation/ordering.txt``. + +------------------------------------------------------------------ + +.. kernel-include:: tools/memory-model/Documentation/ordering.txt + :literal: diff --git a/Documentation/dev-tools/lkmm/docs/readme.rst b/Documentation/dev-tools/lkmm/docs/readme.rst new file mode 100644 index 000000000000..51e7a64e4435 --- /dev/null +++ b/Documentation/dev-tools/lkmm/docs/readme.rst @@ -0,0 +1,11 @@ +.. SPDX-License-Identifier: GPL-2.0 + +README (for LKMM Documentation) +------------------------------- + +Literal include of ``tools/memory-model/Documentation/README``. + +------------------------------------------------------------------ + +.. kernel-include:: tools/memory-model/Documentation/README + :literal: diff --git a/Documentation/dev-tools/lkmm/docs/recipes.rst b/Documentation/dev-tools/lkmm/docs/recipes.rst new file mode 100644 index 000000000000..e55952640047 --- /dev/null +++ b/Documentation/dev-tools/lkmm/docs/recipes.rst @@ -0,0 +1,11 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Recipes +------- + +Literal include of ``tools/memory-model/Documentation/recipes.txt``. + +------------------------------------------------------------------ + +.. kernel-include:: tools/memory-model/Documentation/recipes.txt + :literal: diff --git a/Documentation/dev-tools/lkmm/docs/references.rst b/Documentation/dev-tools/lkmm/docs/references.rst new file mode 100644 index 000000000000..c6831b3c9c02 --- /dev/null +++ b/Documentation/dev-tools/lkmm/docs/references.rst @@ -0,0 +1,11 @@ +.. SPDX-License-Identifier: GPL-2.0 + +References +---------- + +Literal include of ``tools/memory-model/Documentation/references.txt``. + +------------------------------------------------------------------ + +.. kernel-include:: tools/memory-model/Documentation/references.txt + :literal: diff --git a/Documentation/dev-tools/lkmm/docs/simple.rst b/Documentation/dev-tools/lkmm/docs/simple.rst new file mode 100644 index 000000000000..5c1094c95f45 --- /dev/null +++ b/Documentation/dev-tools/lkmm/docs/simple.rst @@ -0,0 +1,11 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Simple +------ + +Literal include of ``tools/memory-model/Documentation/simple.txt``. + +------------------------------------------------------------------ + +.. kernel-include:: tools/memory-model/Documentation/simple.txt + :literal: diff --git a/Documentation/dev-tools/lkmm/index.rst b/Documentation/dev-tools/lkmm/index.rst new file mode 100644 index 000000000000..e52782449ca3 --- /dev/null +++ b/Documentation/dev-tools/lkmm/index.rst @@ -0,0 +1,15 @@ +.. 
SPDX-License-Identifier: GPL-2.0 + +============================================ +Linux Kernel Memory Consistency Model (LKMM) +============================================ + +This section literally renders documents under ``tools/memory-model/`` +and ``tools/memory-model/Documentation/``, which are maintained in +pure plain-text form. + +.. toctree:: + :maxdepth: 2 + + readme + docs/index
diff --git a/Documentation/dev-tools/lkmm/readme.rst b/Documentation/dev-tools/lkmm/readme.rst new file mode 100644 index 000000000000..a7f847109584 --- /dev/null +++ b/Documentation/dev-tools/lkmm/readme.rst @@ -0,0 +1,11 @@ +.. SPDX-License-Identifier: GPL-2.0 + +README (for LKMM) +================= + +Literal include of ``tools/memory-model/README``. + +------------------------------------------------------------ + +.. kernel-include:: tools/memory-model/README + :literal:
diff --git a/Documentation/devicetree/bindings/clock/axis,artpec8-clock.yaml b/Documentation/devicetree/bindings/clock/axis,artpec8-clock.yaml new file mode 100644 index 000000000000..277af48ac841 --- /dev/null +++ b/Documentation/devicetree/bindings/clock/axis,artpec8-clock.yaml @@ -0,0 +1,213 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/clock/axis,artpec8-clock.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Axis ARTPEC-8 SoC clock controller + +maintainers: + - Jesper Nilsson <jesper.nilsson@axis.com> + +description: | + The ARTPEC-8 clock controller is composed of several CMU (Clock Management Unit) + units, generating clocks for different domains. Those CMU units are modeled + as separate device tree nodes, and might depend on each other. + The root clock of that clock tree is an external clock: OSCCLK (25 MHz). + This external clock must be defined as a fixed-rate clock in dts. + + CMU_CMU is a top-level CMU, where all base clocks are prepared using PLLs and + dividers; all other clocks of function blocks (other CMUs) are usually + derived from CMU_CMU. + + Each clock is assigned an identifier and client nodes can use this identifier + to specify the clock which they consume. All clocks available for usage + in clock consumer nodes are defined as preprocessor macros in + 'include/dt-bindings/clock/axis,artpec8-clk.h' header.
+ +properties: + compatible: + enum: + - axis,artpec8-cmu-cmu + - axis,artpec8-cmu-bus + - axis,artpec8-cmu-core + - axis,artpec8-cmu-cpucl + - axis,artpec8-cmu-fsys + - axis,artpec8-cmu-imem + - axis,artpec8-cmu-peri + + reg: + maxItems: 1 + + clocks: + minItems: 1 + maxItems: 5 + + clock-names: + minItems: 1 + maxItems: 5 + + "#clock-cells": + const: 1 + +required: + - compatible + - reg + - clocks + - clock-names + - "#clock-cells" + +allOf: + - if: + properties: + compatible: + const: axis,artpec8-cmu-cmu + + then: + properties: + clocks: + items: + - description: External reference clock (25 MHz) + + clock-names: + items: + - const: fin_pll + + - if: + properties: + compatible: + const: axis,artpec8-cmu-bus + + then: + properties: + clocks: + items: + - description: External reference clock (25 MHz) + - description: CMU_BUS BUS clock (from CMU_CMU) + - description: CMU_BUS DLP clock (from CMU_CMU) + + clock-names: + items: + - const: fin_pll + - const: bus + - const: dlp + + - if: + properties: + compatible: + const: axis,artpec8-cmu-core + + then: + properties: + clocks: + items: + - description: External reference clock (25 MHz) + - description: CMU_CORE main clock (from CMU_CMU) + - description: CMU_CORE DLP clock (from CMU_CMU) + + clock-names: + items: + - const: fin_pll + - const: main + - const: dlp + + - if: + properties: + compatible: + const: axis,artpec8-cmu-cpucl + + then: + properties: + clocks: + items: + - description: External reference clock (25 MHz) + - description: CMU_CPUCL switch clock (from CMU_CMU) + + clock-names: + items: + - const: fin_pll + - const: switch + + - if: + properties: + compatible: + const: axis,artpec8-cmu-fsys + + then: + properties: + clocks: + items: + - description: External reference clock (25 MHz) + - description: CMU_FSYS SCAN0 clock (from CMU_CMU) + - description: CMU_FSYS SCAN1 clock (from CMU_CMU) + - description: CMU_FSYS BUS clock (from CMU_CMU) + - description: CMU_FSYS IP clock (from CMU_CMU) + + clock-names: + items: + - const: fin_pll + - const: scan0 + - const: scan1 + - const: bus + - const: ip + + - if: + properties: + compatible: + const: axis,artpec8-cmu-imem + + then: + properties: + clocks: + items: + - description: External reference clock (25 MHz) + - description: CMU_IMEM ACLK clock (from CMU_CMU) + - description: CMU_IMEM JPEG clock (from CMU_CMU) + + clock-names: + items: + - const: fin_pll + - const: aclk + - const: jpeg + + - if: + properties: + compatible: + const: axis,artpec8-cmu-peri + + then: + properties: + clocks: + items: + - description: External reference clock (25 MHz) + - description: CMU_PERI IP clock (from CMU_CMU) + - description: CMU_PERI AUDIO clock (from CMU_CMU) + - description: CMU_PERI DISP clock (from CMU_CMU) + + clock-names: + items: + - const: fin_pll + - const: ip + - const: audio + - const: disp + +additionalProperties: false + +examples: + # Clock controller node for CMU_FSYS + - | + #include <dt-bindings/clock/axis,artpec8-clk.h> + + cmu_fsys: clock-controller@16c10000 { + compatible = "axis,artpec8-cmu-fsys"; + reg = <0x16c10000 0x4000>; + #clock-cells = <1>; + clocks = <&fin_pll>, + <&cmu_cmu CLK_DOUT_CMU_FSYS_SCAN0>, + <&cmu_cmu CLK_DOUT_CMU_FSYS_SCAN1>, + <&cmu_cmu CLK_DOUT_CMU_FSYS_BUS>, + <&cmu_cmu CLK_DOUT_CMU_FSYS_IP>; + clock-names = "fin_pll", "scan0", "scan1", "bus", "ip"; + }; + +... 
diff --git a/Documentation/devicetree/bindings/submitting-patches.rst b/Documentation/devicetree/bindings/submitting-patches.rst index 5f24570f72e9..ce767b1eccf2 100644 --- a/Documentation/devicetree/bindings/submitting-patches.rst +++ b/Documentation/devicetree/bindings/submitting-patches.rst @@ -66,7 +66,7 @@ I. For patch submitters any DTS patches, regardless whether using existing or new bindings, should be placed at the end of patchset to indicate no dependency of drivers on the DTS. DTS will be anyway applied through separate tree or branch, so - different order would indicate the serie is non-bisectable. + different order would indicate the series is non-bisectable. If a driver subsystem maintainer prefers to apply entire set, instead of their relevant portion of patchset, please split the DTS patches into diff --git a/Documentation/devicetree/bindings/ufs/qcom,sc7180-ufshc.yaml b/Documentation/devicetree/bindings/ufs/qcom,sc7180-ufshc.yaml new file mode 100644 index 000000000000..d94ef4e6b85a --- /dev/null +++ b/Documentation/devicetree/bindings/ufs/qcom,sc7180-ufshc.yaml @@ -0,0 +1,167 @@ +# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/ufs/qcom,sc7180-ufshc.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Qualcomm SC7180 and Other SoCs UFS Controllers + +maintainers: + - Bjorn Andersson <bjorn.andersson@linaro.org> + +# Select only our matches, not all jedec,ufs-2.0 +select: + properties: + compatible: + contains: + enum: + - qcom,msm8998-ufshc + - qcom,qcs8300-ufshc + - qcom,sa8775p-ufshc + - qcom,sc7180-ufshc + - qcom,sc7280-ufshc + - qcom,sc8180x-ufshc + - qcom,sc8280xp-ufshc + - qcom,sm8250-ufshc + - qcom,sm8350-ufshc + - qcom,sm8450-ufshc + - qcom,sm8550-ufshc + required: + - compatible + +properties: + compatible: + items: + - enum: + - qcom,msm8998-ufshc + - qcom,qcs8300-ufshc + - qcom,sa8775p-ufshc + - qcom,sc7180-ufshc + - qcom,sc7280-ufshc + - qcom,sc8180x-ufshc + - qcom,sc8280xp-ufshc + - qcom,sm8250-ufshc + - qcom,sm8350-ufshc + - qcom,sm8450-ufshc + - qcom,sm8550-ufshc + - const: qcom,ufshc + - const: jedec,ufs-2.0 + + reg: + maxItems: 1 + + reg-names: + items: + - const: std + + clocks: + minItems: 7 + maxItems: 8 + + clock-names: + minItems: 7 + items: + - const: core_clk + - const: bus_aggr_clk + - const: iface_clk + - const: core_clk_unipro + - const: ref_clk + - const: tx_lane0_sync_clk + - const: rx_lane0_sync_clk + - const: rx_lane1_sync_clk + + qcom,ice: + $ref: /schemas/types.yaml#/definitions/phandle + description: phandle to the Inline Crypto Engine node + +required: + - compatible + - reg + +allOf: + - $ref: qcom,ufs-common.yaml + + - if: + properties: + compatible: + contains: + enum: + - qcom,sc7180-ufshc + then: + properties: + clocks: + maxItems: 7 + clock-names: + maxItems: 7 + else: + properties: + clocks: + minItems: 8 + clock-names: + minItems: 8 + +unevaluatedProperties: false + +examples: + - | + #include <dt-bindings/clock/qcom,gcc-sm8450.h> + #include <dt-bindings/clock/qcom,rpmh.h> + #include <dt-bindings/gpio/gpio.h> + #include <dt-bindings/interconnect/qcom,sm8450.h> + #include <dt-bindings/interrupt-controller/arm-gic.h> + + soc { + #address-cells = <2>; + #size-cells = <2>; + + ufs@1d84000 { + compatible = "qcom,sm8450-ufshc", "qcom,ufshc", + "jedec,ufs-2.0"; + reg = <0x0 0x01d84000 0x0 0x3000>; + interrupts = <GIC_SPI 265 IRQ_TYPE_LEVEL_HIGH>; + phys = <&ufs_mem_phy_lanes>; + phy-names = "ufsphy"; + lanes-per-direction = <2>; + #reset-cells = <1>; + resets 
= <&gcc GCC_UFS_PHY_BCR>; + reset-names = "rst"; + reset-gpios = <&tlmm 210 GPIO_ACTIVE_LOW>; + + vcc-supply = <&vreg_l7b_2p5>; + vcc-max-microamp = <1100000>; + vccq-supply = <&vreg_l9b_1p2>; + vccq-max-microamp = <1200000>; + + power-domains = <&gcc UFS_PHY_GDSC>; + iommus = <&apps_smmu 0xe0 0x0>; + interconnects = <&aggre1_noc MASTER_UFS_MEM &mc_virt SLAVE_EBI1>, + <&gem_noc MASTER_APPSS_PROC &config_noc SLAVE_UFS_MEM_CFG>; + interconnect-names = "ufs-ddr", "cpu-ufs"; + + clocks = <&gcc GCC_UFS_PHY_AXI_CLK>, + <&gcc GCC_AGGRE_UFS_PHY_AXI_CLK>, + <&gcc GCC_UFS_PHY_AHB_CLK>, + <&gcc GCC_UFS_PHY_UNIPRO_CORE_CLK>, + <&rpmhcc RPMH_CXO_CLK>, + <&gcc GCC_UFS_PHY_TX_SYMBOL_0_CLK>, + <&gcc GCC_UFS_PHY_RX_SYMBOL_0_CLK>, + <&gcc GCC_UFS_PHY_RX_SYMBOL_1_CLK>; + clock-names = "core_clk", + "bus_aggr_clk", + "iface_clk", + "core_clk_unipro", + "ref_clk", + "tx_lane0_sync_clk", + "rx_lane0_sync_clk", + "rx_lane1_sync_clk"; + freq-table-hz = <75000000 300000000>, + <0 0>, + <0 0>, + <75000000 300000000>, + <75000000 300000000>, + <0 0>, + <0 0>, + <0 0>; + qcom,ice = <&ice>; + }; + }; diff --git a/Documentation/devicetree/bindings/ufs/qcom,sm8650-ufshc.yaml b/Documentation/devicetree/bindings/ufs/qcom,sm8650-ufshc.yaml new file mode 100644 index 000000000000..aaa0bbb5bfe1 --- /dev/null +++ b/Documentation/devicetree/bindings/ufs/qcom,sm8650-ufshc.yaml @@ -0,0 +1,178 @@ +# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/ufs/qcom,sm8650-ufshc.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Qualcomm SM8650 and Other SoCs UFS Controllers + +maintainers: + - Bjorn Andersson <bjorn.andersson@linaro.org> + +# Select only our matches, not all jedec,ufs-2.0 +select: + properties: + compatible: + contains: + enum: + - qcom,sm8650-ufshc + - qcom,sm8750-ufshc + required: + - compatible + +properties: + compatible: + items: + - enum: + - qcom,sm8650-ufshc + - qcom,sm8750-ufshc + - const: qcom,ufshc + - const: jedec,ufs-2.0 + + reg: + minItems: 1 + maxItems: 2 + + reg-names: + minItems: 1 + items: + - const: std + - const: mcq + + clocks: + minItems: 8 + maxItems: 8 + + clock-names: + items: + - const: core_clk + - const: bus_aggr_clk + - const: iface_clk + - const: core_clk_unipro + - const: ref_clk + - const: tx_lane0_sync_clk + - const: rx_lane0_sync_clk + - const: rx_lane1_sync_clk + + qcom,ice: + $ref: /schemas/types.yaml#/definitions/phandle + description: phandle to the Inline Crypto Engine node + +required: + - compatible + - reg + +allOf: + - $ref: qcom,ufs-common.yaml + +unevaluatedProperties: false + +examples: + - | + #include <dt-bindings/clock/qcom,sm8650-gcc.h> + #include <dt-bindings/clock/qcom,sm8650-tcsr.h> + #include <dt-bindings/clock/qcom,rpmh.h> + #include <dt-bindings/gpio/gpio.h> + #include <dt-bindings/interconnect/qcom,icc.h> + #include <dt-bindings/interconnect/qcom,sm8650-rpmh.h> + #include <dt-bindings/interrupt-controller/arm-gic.h> + + soc { + #address-cells = <2>; + #size-cells = <2>; + + ufshc@1d84000 { + compatible = "qcom,sm8650-ufshc", "qcom,ufshc", "jedec,ufs-2.0"; + reg = <0x0 0x01d84000 0x0 0x3000>; + + interrupts = <GIC_SPI 265 IRQ_TYPE_LEVEL_HIGH 0>; + + clocks = <&gcc GCC_UFS_PHY_AXI_CLK>, + <&gcc GCC_AGGRE_UFS_PHY_AXI_CLK>, + <&gcc GCC_UFS_PHY_AHB_CLK>, + <&gcc GCC_UFS_PHY_UNIPRO_CORE_CLK>, + <&tcsr TCSR_UFS_PAD_CLKREF_EN>, + <&gcc GCC_UFS_PHY_TX_SYMBOL_0_CLK>, + <&gcc GCC_UFS_PHY_RX_SYMBOL_0_CLK>, + <&gcc GCC_UFS_PHY_RX_SYMBOL_1_CLK>; + clock-names = "core_clk", + "bus_aggr_clk", + 
"iface_clk", + "core_clk_unipro", + "ref_clk", + "tx_lane0_sync_clk", + "rx_lane0_sync_clk", + "rx_lane1_sync_clk"; + + resets = <&gcc GCC_UFS_PHY_BCR>; + reset-names = "rst"; + reset-gpios = <&tlmm 210 GPIO_ACTIVE_LOW>; + + interconnects = <&aggre1_noc MASTER_UFS_MEM QCOM_ICC_TAG_ALWAYS + &mc_virt SLAVE_EBI1 QCOM_ICC_TAG_ALWAYS>, + <&gem_noc MASTER_APPSS_PROC QCOM_ICC_TAG_ACTIVE_ONLY + &config_noc SLAVE_UFS_MEM_CFG QCOM_ICC_TAG_ACTIVE_ONLY>; + interconnect-names = "ufs-ddr", + "cpu-ufs"; + + power-domains = <&gcc UFS_PHY_GDSC>; + required-opps = <&rpmhpd_opp_nom>; + + operating-points-v2 = <&ufs_opp_table>; + + iommus = <&apps_smmu 0x60 0>; + + lanes-per-direction = <2>; + qcom,ice = <&ice>; + + phys = <&ufs_mem_phy>; + phy-names = "ufsphy"; + + #reset-cells = <1>; + + vcc-supply = <&vreg_l7b_2p5>; + vcc-max-microamp = <1100000>; + vccq-supply = <&vreg_l9b_1p2>; + vccq-max-microamp = <1200000>; + + ufs_opp_table: opp-table { + compatible = "operating-points-v2"; + + opp-100000000 { + opp-hz = /bits/ 64 <100000000>, + /bits/ 64 <0>, + /bits/ 64 <0>, + /bits/ 64 <100000000>, + /bits/ 64 <0>, + /bits/ 64 <0>, + /bits/ 64 <0>, + /bits/ 64 <0>; + required-opps = <&rpmhpd_opp_low_svs>; + }; + + opp-201500000 { + opp-hz = /bits/ 64 <201500000>, + /bits/ 64 <0>, + /bits/ 64 <0>, + /bits/ 64 <201500000>, + /bits/ 64 <0>, + /bits/ 64 <0>, + /bits/ 64 <0>, + /bits/ 64 <0>; + required-opps = <&rpmhpd_opp_svs>; + }; + + opp-403000000 { + opp-hz = /bits/ 64 <403000000>, + /bits/ 64 <0>, + /bits/ 64 <0>, + /bits/ 64 <403000000>, + /bits/ 64 <0>, + /bits/ 64 <0>, + /bits/ 64 <0>, + /bits/ 64 <0>; + required-opps = <&rpmhpd_opp_nom>; + }; + }; + }; + }; diff --git a/Documentation/devicetree/bindings/ufs/qcom,ufs-common.yaml b/Documentation/devicetree/bindings/ufs/qcom,ufs-common.yaml new file mode 100644 index 000000000000..962dffcd28b4 --- /dev/null +++ b/Documentation/devicetree/bindings/ufs/qcom,ufs-common.yaml @@ -0,0 +1,67 @@ +# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/ufs/qcom,ufs-common.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Qualcomm Universal Flash Storage (UFS) Controller Common Properties + +maintainers: + - Bjorn Andersson <bjorn.andersson@linaro.org> + +properties: + clocks: + minItems: 7 + maxItems: 9 + + clock-names: + minItems: 7 + maxItems: 9 + + dma-coherent: true + + interconnects: + minItems: 2 + maxItems: 2 + + interconnect-names: + items: + - const: ufs-ddr + - const: cpu-ufs + + iommus: + minItems: 1 + maxItems: 2 + + phys: + maxItems: 1 + + phy-names: + items: + - const: ufsphy + + power-domains: + maxItems: 1 + + required-opps: + maxItems: 1 + + resets: + maxItems: 1 + + '#reset-cells': + const: 1 + + reset-names: + items: + - const: rst + + reset-gpios: + maxItems: 1 + description: + GPIO connected to the RESET pin of the UFS memory device. 
+ +allOf: + - $ref: ufs-common.yaml + +additionalProperties: true diff --git a/Documentation/devicetree/bindings/ufs/qcom,ufs.yaml b/Documentation/devicetree/bindings/ufs/qcom,ufs.yaml index 6c6043d9809e..1dd41f6d5258 100644 --- a/Documentation/devicetree/bindings/ufs/qcom,ufs.yaml +++ b/Documentation/devicetree/bindings/ufs/qcom,ufs.yaml @@ -15,7 +15,15 @@ select: properties: compatible: contains: - const: qcom,ufshc + enum: + - qcom,msm8994-ufshc + - qcom,msm8996-ufshc + - qcom,qcs615-ufshc + - qcom,sdm845-ufshc + - qcom,sm6115-ufshc + - qcom,sm6125-ufshc + - qcom,sm6350-ufshc + - qcom,sm8150-ufshc required: - compatible @@ -25,61 +33,15 @@ properties: - enum: - qcom,msm8994-ufshc - qcom,msm8996-ufshc - - qcom,msm8998-ufshc - qcom,qcs615-ufshc - - qcom,qcs8300-ufshc - - qcom,sa8775p-ufshc - - qcom,sc7180-ufshc - - qcom,sc7280-ufshc - - qcom,sc8180x-ufshc - - qcom,sc8280xp-ufshc - qcom,sdm845-ufshc - qcom,sm6115-ufshc - qcom,sm6125-ufshc - qcom,sm6350-ufshc - qcom,sm8150-ufshc - - qcom,sm8250-ufshc - - qcom,sm8350-ufshc - - qcom,sm8450-ufshc - - qcom,sm8550-ufshc - - qcom,sm8650-ufshc - - qcom,sm8750-ufshc - const: qcom,ufshc - const: jedec,ufs-2.0 - clocks: - minItems: 7 - maxItems: 9 - - clock-names: - minItems: 7 - maxItems: 9 - - dma-coherent: true - - interconnects: - minItems: 2 - maxItems: 2 - - interconnect-names: - items: - - const: ufs-ddr - - const: cpu-ufs - - iommus: - minItems: 1 - maxItems: 2 - - phys: - maxItems: 1 - - phy-names: - items: - - const: ufsphy - - power-domains: - maxItems: 1 - qcom,ice: $ref: /schemas/types.yaml#/definitions/phandle description: phandle to the Inline Crypto Engine node @@ -93,93 +55,12 @@ properties: - const: std - const: ice - required-opps: - maxItems: 1 - - resets: - maxItems: 1 - - '#reset-cells': - const: 1 - - reset-names: - items: - - const: rst - - reset-gpios: - maxItems: 1 - description: - GPIO connected to the RESET pin of the UFS memory device. 
- required: - compatible - reg allOf: - - $ref: ufs-common.yaml - - - if: - properties: - compatible: - contains: - enum: - - qcom,sc7180-ufshc - then: - properties: - clocks: - minItems: 7 - maxItems: 7 - clock-names: - items: - - const: core_clk - - const: bus_aggr_clk - - const: iface_clk - - const: core_clk_unipro - - const: ref_clk - - const: tx_lane0_sync_clk - - const: rx_lane0_sync_clk - reg: - maxItems: 1 - reg-names: - maxItems: 1 - - - if: - properties: - compatible: - contains: - enum: - - qcom,msm8998-ufshc - - qcom,qcs8300-ufshc - - qcom,sa8775p-ufshc - - qcom,sc7280-ufshc - - qcom,sc8180x-ufshc - - qcom,sc8280xp-ufshc - - qcom,sm8250-ufshc - - qcom,sm8350-ufshc - - qcom,sm8450-ufshc - - qcom,sm8550-ufshc - - qcom,sm8650-ufshc - - qcom,sm8750-ufshc - then: - properties: - clocks: - minItems: 8 - maxItems: 8 - clock-names: - items: - - const: core_clk - - const: bus_aggr_clk - - const: iface_clk - - const: core_clk_unipro - - const: ref_clk - - const: tx_lane0_sync_clk - - const: rx_lane0_sync_clk - - const: rx_lane1_sync_clk - reg: - minItems: 1 - maxItems: 1 - reg-names: - maxItems: 1 + - $ref: qcom,ufs-common.yaml - if: properties: @@ -297,10 +178,10 @@ unevaluatedProperties: false examples: - | - #include <dt-bindings/clock/qcom,gcc-sm8450.h> + #include <dt-bindings/clock/qcom,gcc-sm8150.h> #include <dt-bindings/clock/qcom,rpmh.h> #include <dt-bindings/gpio/gpio.h> - #include <dt-bindings/interconnect/qcom,sm8450.h> + #include <dt-bindings/interconnect/qcom,sm8150.h> #include <dt-bindings/interrupt-controller/arm-gic.h> soc { @@ -308,9 +189,12 @@ examples: #size-cells = <2>; ufs@1d84000 { - compatible = "qcom,sm8450-ufshc", "qcom,ufshc", + compatible = "qcom,sm8150-ufshc", "qcom,ufshc", "jedec,ufs-2.0"; - reg = <0 0x01d84000 0 0x3000>; + reg = <0x0 0x01d84000 0x0 0x2500>, + <0x0 0x01d90000 0x0 0x8000>; + reg-names = "std", "ice"; + interrupts = <GIC_SPI 265 IRQ_TYPE_LEVEL_HIGH>; phys = <&ufs_mem_phy_lanes>; phy-names = "ufsphy"; @@ -326,19 +210,8 @@ examples: vccq-max-microamp = <1200000>; power-domains = <&gcc UFS_PHY_GDSC>; - iommus = <&apps_smmu 0xe0 0x0>; - interconnects = <&aggre1_noc MASTER_UFS_MEM &mc_virt SLAVE_EBI1>, - <&gem_noc MASTER_APPSS_PROC &config_noc SLAVE_UFS_MEM_CFG>; - interconnect-names = "ufs-ddr", "cpu-ufs"; + iommus = <&apps_smmu 0x300 0>; - clock-names = "core_clk", - "bus_aggr_clk", - "iface_clk", - "core_clk_unipro", - "ref_clk", - "tx_lane0_sync_clk", - "rx_lane0_sync_clk", - "rx_lane1_sync_clk"; clocks = <&gcc GCC_UFS_PHY_AXI_CLK>, <&gcc GCC_AGGRE_UFS_PHY_AXI_CLK>, <&gcc GCC_UFS_PHY_AHB_CLK>, @@ -346,15 +219,25 @@ examples: <&rpmhcc RPMH_CXO_CLK>, <&gcc GCC_UFS_PHY_TX_SYMBOL_0_CLK>, <&gcc GCC_UFS_PHY_RX_SYMBOL_0_CLK>, - <&gcc GCC_UFS_PHY_RX_SYMBOL_1_CLK>; - freq-table-hz = <75000000 300000000>, + <&gcc GCC_UFS_PHY_RX_SYMBOL_1_CLK>, + <&gcc GCC_UFS_PHY_ICE_CORE_CLK>; + clock-names = "core_clk", + "bus_aggr_clk", + "iface_clk", + "core_clk_unipro", + "ref_clk", + "tx_lane0_sync_clk", + "rx_lane0_sync_clk", + "rx_lane1_sync_clk", + "ice_core_clk"; + freq-table-hz = <37500000 300000000>, + <0 0>, + <0 0>, + <37500000 300000000>, <0 0>, <0 0>, - <75000000 300000000>, - <75000000 300000000>, <0 0>, <0 0>, - <0 0>; - qcom,ice = <&ice>; + <0 300000000>; }; }; diff --git a/Documentation/devicetree/bindings/ufs/ufs-common.yaml b/Documentation/devicetree/bindings/ufs/ufs-common.yaml index 31fe7f30ff5b..9f04f34d8c5a 100644 --- a/Documentation/devicetree/bindings/ufs/ufs-common.yaml +++ b/Documentation/devicetree/bindings/ufs/ufs-common.yaml @@ -89,6 +89,22 
@@ properties: msi-parent: true + limit-hs-gear: + $ref: /schemas/types.yaml#/definitions/uint32 + minimum: 1 + maximum: 6 + default: 6 + description: + Restricts the maximum HS gear used in both TX and RX directions. + + limit-gear-rate: + $ref: /schemas/types.yaml#/definitions/string + enum: [rate-a, rate-b] + default: rate-b + description: + Restricts the UFS controller to rate-a or rate-b for both TX and + RX directions. + dependencies: freq-table-hz: [ clocks ] operating-points-v2: [ clocks, clock-names ] diff --git a/Documentation/driver-api/cxl/devices/device-types.rst b/Documentation/driver-api/cxl/devices/device-types.rst index 923f5d89bc04..7f69dfa4509b 100644 --- a/Documentation/driver-api/cxl/devices/device-types.rst +++ b/Documentation/driver-api/cxl/devices/device-types.rst @@ -22,7 +22,7 @@ The basic interaction protocol, similar to PCIe configuration mechanisms. Typically used for initialization, configuration, and I/O access for anything other than memory (CXL.mem) or cache (CXL.cache) operations. -The Linux CXL driver exposes access to .io functionalty via the various sysfs +The Linux CXL driver exposes access to .io functionality via the various sysfs interfaces and /dev/cxl/ devices (which exposes direct access to device mailboxes). diff --git a/Documentation/driver-api/cxl/platform/example-configurations/one-dev-per-hb.rst b/Documentation/driver-api/cxl/platform/example-configurations/one-dev-per-hb.rst index aebda0eb3e17..a4c3fb51ea7d 100644 --- a/Documentation/driver-api/cxl/platform/example-configurations/one-dev-per-hb.rst +++ b/Documentation/driver-api/cxl/platform/example-configurations/one-dev-per-hb.rst @@ -10,7 +10,7 @@ has a single CXL memory expander with a 4GB of memory. Things to note: * Cross-Bridge interleave is not being used. -* The expanders are in two separate but adjascent memory regions. +* The expanders are in two separate but adjacent memory regions. * This CEDT/SRAT describes one node per device * The expanders have the same performance and will be in the same memory tier. diff --git a/Documentation/driver-api/device-io.rst b/Documentation/driver-api/device-io.rst index 5c7e8194bef9..d1aaa961cac4 100644 --- a/Documentation/driver-api/device-io.rst +++ b/Documentation/driver-api/device-io.rst @@ -16,7 +16,7 @@ Bus-Independent Device Accesses Introduction ============ -Linux provides an API which abstracts performing IO across all busses +Linux provides an API which abstracts performing IO across all buses and devices, allowing device drivers to be written independently of bus type. @@ -71,7 +71,7 @@ can be compiler optimised, you can use __readb() and friends to indicate the relaxed ordering. Use this with care. While the basic functions are defined to be synchronous with respect to -each other and ordered with respect to each other the busses the devices +each other and ordered with respect to each other the buses the devices sit on may themselves have asynchronicity. In particular many authors are burned by the fact that PCI bus writes are posted asynchronously. A driver author must issue a read from the same device to ensure that diff --git a/Documentation/driver-api/driver-model/overview.rst b/Documentation/driver-api/driver-model/overview.rst index e98d0ab4a9b6..b3f447bf9f07 100644 --- a/Documentation/driver-api/driver-model/overview.rst +++ b/Documentation/driver-api/driver-model/overview.rst @@ -22,7 +22,7 @@ uniformity across the different bus types. 
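
A short aside on the Documentation/driver-api/device-io.rst hunk above, which warns that PCI bus writes are posted asynchronously: the usual defense is to read back from the same device to force completion. A minimal sketch, with hypothetical register offsets::

   #include <linux/io.h>

   static void example_kick_device(void __iomem *regs)
   {
           writel(0x1, regs + 0x10);    /* may be posted by a PCI bridge */
           (void)readl(regs + 0x10);    /* read from the same device
                                           forces the write to complete */
   }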
The current driver model provides a common, uniform data model for describing a bus and the devices that can appear under the bus. The unified bus -model includes a set of common attributes which all busses carry, and a set +model includes a set of common attributes which all buses carry, and a set of common callbacks, such as device discovery during bus probing, bus shutdown, bus power management, etc. diff --git a/Documentation/driver-api/driver-model/platform.rst b/Documentation/driver-api/driver-model/platform.rst index 7beb8a9648c5..cf5ff48d3115 100644 --- a/Documentation/driver-api/driver-model/platform.rst +++ b/Documentation/driver-api/driver-model/platform.rst @@ -4,7 +4,7 @@ Platform Devices and Drivers See <linux/platform_device.h> for the driver model interface to the platform bus: platform_device, and platform_driver. This pseudo-bus -is used to connect devices on busses with minimal infrastructure, +is used to connect devices on buses with minimal infrastructure, like those used to integrate peripherals on many system-on-chip processors, or some "legacy" PC interconnects; as opposed to large formally specified ones like PCI or USB. diff --git a/Documentation/driver-api/eisa.rst b/Documentation/driver-api/eisa.rst index b33ebe1ec9ed..3563e5f7e98d 100644 --- a/Documentation/driver-api/eisa.rst +++ b/Documentation/driver-api/eisa.rst @@ -8,9 +8,9 @@ This document groups random notes about porting EISA drivers to the new EISA/sysfs API. Starting from version 2.5.59, the EISA bus is almost given the same -status as other much more mainstream busses such as PCI or USB. This +status as other much more mainstream buses such as PCI or USB. This has been possible through sysfs, which defines a nice enough set of -abstractions to manage busses, devices and drivers. +abstractions to manage buses, devices and drivers. Although the new API is quite simple to use, converting existing drivers to the new infrastructure is not an easy task (mostly because @@ -205,7 +205,7 @@ Random notes Converting an EISA driver to the new API mostly involves *deleting* code (since probing is now in the core EISA code). Unfortunately, most drivers share their probing routine between ISA, and EISA. Special -care must be taken when ripping out the EISA code, so other busses +care must be taken when ripping out the EISA code, so other buses won't suffer from these surgical strikes... You *must not* expect any EISA device to be detected when returning diff --git a/Documentation/driver-api/i3c/protocol.rst b/Documentation/driver-api/i3c/protocol.rst index 23a0b93c62b1..fe338f8085db 100644 --- a/Documentation/driver-api/i3c/protocol.rst +++ b/Documentation/driver-api/i3c/protocol.rst @@ -165,8 +165,8 @@ The first thing attached to an HDR command is the HDR mode. There are currently for more details): * HDR-DDR: Double Data Rate mode -* HDR-TSP: Ternary Symbol Pure. Only usable on busses with no I2C devices -* HDR-TSL: Ternary Symbol Legacy. Usable on busses with I2C devices +* HDR-TSP: Ternary Symbol Pure. Only usable on buses with no I2C devices +* HDR-TSL: Ternary Symbol Legacy. Usable on buses with I2C devices When sending an HDR command, the whole bus has to enter HDR mode, which is done using a broadcast CCC command. 
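
As an aside to the Documentation/driver-api/driver-model/platform.rst hunk above, the minimal-infrastructure pseudo-bus it describes is what a platform driver registers against. A sketch of the canonical skeleton (the driver name here is made up)::

   #include <linux/module.h>
   #include <linux/platform_device.h>

   static int example_probe(struct platform_device *pdev)
   {
           dev_info(&pdev->dev, "probed\n");
           return 0;
   }

   static struct platform_driver example_driver = {
           .probe = example_probe,
           .driver = {
                   /* matched against the platform device's name */
                   .name = "example-device",
           },
   };
   module_platform_driver(example_driver);

   MODULE_DESCRIPTION("Example platform driver skeleton");
   MODULE_LICENSE("GPL");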
diff --git a/Documentation/driver-api/ipmi.rst b/Documentation/driver-api/ipmi.rst index 2cc6c898ab90..f52ab2df2569 100644 --- a/Documentation/driver-api/ipmi.rst +++ b/Documentation/driver-api/ipmi.rst @@ -617,12 +617,12 @@ Note that the address you give here is the I2C address, not the IPMI address. So if you want your MC address to be 0x60, you put 0x30 here. See the I2C driver info for more details. -Command bridging to other IPMB busses through this interface does not +Command bridging to other IPMB buses through this interface does not work. The receive message queue is not implemented, by design. There is only one receive message queue on a BMC, and that is meant for the host drivers, not something on the IPMB bus. -A BMC may have multiple IPMB busses, which bus your device sits on +A BMC may have multiple IPMB buses, which bus your device sits on depends on how the system is wired. You can fetch the channels with "ipmitool channel info <n>" where <n> is the channel, with the channels being 0-7 and try the IPMB channels. diff --git a/Documentation/driver-api/media/tx-rx.rst b/Documentation/driver-api/media/tx-rx.rst index 0b8c9cde8ee4..22e1b13ecde9 100644 --- a/Documentation/driver-api/media/tx-rx.rst +++ b/Documentation/driver-api/media/tx-rx.rst @@ -12,7 +12,7 @@ CSI-2 receiver in an SoC. Bus types --------- -The following busses are the most common. This section discusses these two only. +The following buses are the most common. This section discusses these two only. MIPI CSI-2 ^^^^^^^^^^ @@ -36,7 +36,7 @@ Transmitter drivers Transmitter drivers generally need to provide the receiver drivers with the configuration of the transmitter. What is required depends on the type of the -bus. These are common for both busses. +bus. These are common for both buses. Media bus pixel code ^^^^^^^^^^^^^^^^^^^^ diff --git a/Documentation/driver-api/nvdimm/nvdimm.rst b/Documentation/driver-api/nvdimm/nvdimm.rst index c205efa4d45b..959ba1cc0263 100644 --- a/Documentation/driver-api/nvdimm/nvdimm.rst +++ b/Documentation/driver-api/nvdimm/nvdimm.rst @@ -230,7 +230,7 @@ LIBNVDIMM/LIBNDCTL: Bus A bus has a 1:1 relationship with an NFIT. The current expectation for ACPI based systems is that there is only ever one platform-global NFIT. That said, it is trivial to register multiple NFITs, the specification -does not preclude it. The infrastructure supports multiple busses and +does not preclude it. The infrastructure supports multiple buses and we use this capability to test multiple NFIT configurations in the unit test. 
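
A brief aside on the ipmi.rst passage above (MC address 0x60 given to the driver as 0x30): IPMB addresses are 8-bit values with the low R/W bit always zero, while Linux I2C client addresses are 7-bit, so the conversion is a single shift. A tiny illustrative sketch, not from any kernel source::

   #include <stdio.h>

   /* drop the always-zero R/W bit of an 8-bit IPMB address */
   static unsigned int ipmb_to_i2c(unsigned int ipmb_addr)
   {
           return ipmb_addr >> 1;
   }

   int main(void)
   {
           printf("IPMB 0x%02x -> I2C 0x%02x\n", 0x60, ipmb_to_i2c(0x60));
           return 0;
   }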
diff --git a/Documentation/driver-api/pin-control.rst b/Documentation/driver-api/pin-control.rst index 8208924e513e..afc6ddd80fa1 100644 --- a/Documentation/driver-api/pin-control.rst +++ b/Documentation/driver-api/pin-control.rst @@ -1202,22 +1202,24 @@ default state like this: { /* Allocate a state holder named "foo" etc */ struct foo_state *foo = ...; + int ret; foo->p = devm_pinctrl_get(&device); if (IS_ERR(foo->p)) { - /* FIXME: clean up "foo" here */ - return PTR_ERR(foo->p); + ret = PTR_ERR(foo->p); + foo->p = NULL; + return ret; } foo->s = pinctrl_lookup_state(foo->p, PINCTRL_STATE_DEFAULT); if (IS_ERR(foo->s)) { - /* FIXME: clean up "foo" here */ + devm_pinctrl_put(foo->p); return PTR_ERR(foo->s); } ret = pinctrl_select_state(foo->p, foo->s); if (ret < 0) { - /* FIXME: clean up "foo" here */ + devm_pinctrl_put(foo->p); return ret; } } diff --git a/Documentation/driver-api/pm/devices.rst b/Documentation/driver-api/pm/devices.rst index 8d86d5da4023..36d5c9c9fd11 100644 --- a/Documentation/driver-api/pm/devices.rst +++ b/Documentation/driver-api/pm/devices.rst @@ -255,7 +255,7 @@ get registered: a child can never be registered, probed or resumed before its parent; and can't be removed or suspended after that parent. The policy is that the device hierarchy should match hardware bus topology. -[Or at least the control bus, for devices which use multiple busses.] +[Or at least the control bus, for devices which use multiple buses.] In particular, this means that a device registration may fail if the parent of the device is suspending (i.e. has been chosen by the PM core as the next device to suspend) or has already suspended, as well as after all of the other @@ -493,7 +493,7 @@ states, like S3). Drivers must also be prepared to notice that the device has been removed while the system was powered down, whenever that's physically possible. -PCMCIA, MMC, USB, Firewire, SCSI, and even IDE are common examples of busses +PCMCIA, MMC, USB, Firewire, SCSI, and even IDE are common examples of buses where common Linux platforms will see such removal. Details of how drivers will notice and handle such removals are currently bus-specific, and often involve a separate thread. diff --git a/Documentation/driver-api/scsi.rst b/Documentation/driver-api/scsi.rst index bf2be96cc2d6..8bbdfb018c53 100644 --- a/Documentation/driver-api/scsi.rst +++ b/Documentation/driver-api/scsi.rst @@ -18,7 +18,7 @@ optical drives, test equipment, and medical devices) to a host computer. Although the old parallel (fast/wide/ultra) SCSI bus has largely fallen out of use, the SCSI command set is more widely used than ever to -communicate with devices over a number of different busses. +communicate with devices over a number of different buses. The `SCSI protocol <https://www.t10.org/scsi-3.htm>`__ is a big-endian peer-to-peer packet based protocol. SCSI commands are 6, 10, 12, or 16 @@ -286,7 +286,7 @@ Parallel SCSI (SPI) transport class ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The file drivers/scsi/scsi_transport_spi.c defines transport -attributes for traditional (fast/wide/ultra) SCSI busses. +attributes for traditional (fast/wide/ultra) SCSI buses. .. 
kernel-doc:: drivers/scsi/scsi_transport_spi.c :export: diff --git a/Documentation/driver-api/spi.rst b/Documentation/driver-api/spi.rst index f28887045049..74eca6735042 100644 --- a/Documentation/driver-api/spi.rst +++ b/Documentation/driver-api/spi.rst @@ -13,7 +13,7 @@ additional chipselect line is usually active-low (nCS); four signals are normally used for each peripheral, plus sometimes an interrupt. The SPI bus facilities listed here provide a generalized interface to -declare SPI busses and devices, manage them according to the standard +declare SPI buses and devices, manage them according to the standard Linux driver model, and perform input/output operations. At this time, only "master" side interfaces are supported, where Linux talks to SPI peripherals and does not implement such a peripheral itself. (Interfaces diff --git a/Documentation/driver-api/thermal/exynos_thermal_emulation.rst b/Documentation/driver-api/thermal/exynos_thermal_emulation.rst index c21d10838bc5..c679502f01c7 100644 --- a/Documentation/driver-api/thermal/exynos_thermal_emulation.rst +++ b/Documentation/driver-api/thermal/exynos_thermal_emulation.rst @@ -28,13 +28,13 @@ changed into it. delay of changing temperature. However, this node only uses same delay of real sensing time, 938us.) -Exynos emulation mode requires synchronous of value changing and -enabling. It means when you want to update the any value of delay or -next temperature, then you have to enable emulation mode at the same -time. (Or you have to keep the mode enabling.) If you don't, it fails to -change the value to updated one and just use last succeessful value -repeatedly. That's why this node gives users the right to change -termerpature only. Just one interface makes it more simply to use. +Exynos emulation mode requires that value changes and enabling are performed +synchronously. This means that when you want to update any value, such as the +delay or the next temperature, you must enable emulation mode at the same +time (or keep the mode enabled). If you do not, the value will fail to update +and the last successful value will continue to be used. For this reason, +this node only allows users to change the temperature. Providing a single +interface makes it simpler to use. Disabling emulation mode only requires writing value 0 to sysfs node. diff --git a/Documentation/driver-api/usb/hotplug.rst b/Documentation/driver-api/usb/hotplug.rst index c1e13107c50e..12260f704a01 100644 --- a/Documentation/driver-api/usb/hotplug.rst +++ b/Documentation/driver-api/usb/hotplug.rst @@ -5,7 +5,7 @@ Linux Hotplugging ================= -In hotpluggable busses like USB (and Cardbus PCI), end-users plug devices +In hotpluggable buses like USB (and Cardbus PCI), end-users plug devices into the bus with power on. In most cases, users expect the devices to become immediately usable. That means the system must do many things, including: diff --git a/Documentation/driver-api/usb/index.rst b/Documentation/driver-api/usb/index.rst index cfa8797ea614..fcb24d0500d9 100644 --- a/Documentation/driver-api/usb/index.rst +++ b/Documentation/driver-api/usb/index.rst @@ -3,6 +3,7 @@ Linux USB API ============= .. 
toctree:: + :maxdepth: 1 usb gadget diff --git a/Documentation/driver-api/usb/usb.rst b/Documentation/driver-api/usb/usb.rst index 976fb4221062..7f2f41e80c1c 100644 --- a/Documentation/driver-api/usb/usb.rst +++ b/Documentation/driver-api/usb/usb.rst @@ -13,7 +13,7 @@ structure, with the host as the root (the system's master), hubs as interior nodes, and peripherals as leaves (and slaves). Modern PCs support several such trees of USB devices, usually a few USB 3.0 (5 GBit/s) or USB 3.1 (10 GBit/s) and some legacy -USB 2.0 (480 MBit/s) busses just in case. +USB 2.0 (480 MBit/s) buses just in case. That master/slave asymmetry was designed-in for a number of reasons, one being ease of use. It is not physically possible to mistake upstream and @@ -42,7 +42,7 @@ two. One is intended for *general-purpose* drivers (exposed through driver frameworks), and the other is for drivers that are *part of the core*. Such core drivers include the *hub* driver (which manages trees of USB devices) and several different kinds of *host controller -drivers*, which control individual busses. +drivers*, which control individual buses. The device model seen by USB drivers is relatively complex. diff --git a/Documentation/fb/fbcon.rst b/Documentation/fb/fbcon.rst index 212f7003cfba..a98a5cb0b0d8 100644 --- a/Documentation/fb/fbcon.rst +++ b/Documentation/fb/fbcon.rst @@ -39,11 +39,13 @@ Also, you will need to select at least one compiled-in font, but if you don't do anything, the kernel configuration tool will select one for you, usually an 8x16 font. -GOTCHA: A common bug report is enabling the framebuffer without enabling the -framebuffer console. Depending on the driver, you may get a blanked or -garbled display, but the system still boots to completion. If you are -fortunate to have a driver that does not alter the graphics chip, then you -will still get a VGA console. +.. admonition:: GOTCHA + + A common bug report is enabling the framebuffer without enabling the + framebuffer console. Depending on the driver, you may get a blanked or + garbled display, but the system still boots to completion. If you are + fortunate to have a driver that does not alter the graphics chip, then you + will still get a VGA console. B. Loading ========== @@ -74,6 +76,7 @@ Possible scenarios: over the console. C. Boot options +=============== The framebuffer console has several, largely unknown, boot options that can change its behavior. @@ -116,9 +119,10 @@ C. Boot options outside the given range will still be controlled by the standard console driver. - NOTE: For x86 machines, the standard console is the VGA console which - is typically located on the same video card. Thus, the consoles that - are controlled by the VGA console will be garbled. + .. note:: + For x86 machines, the standard console is the VGA console which + is typically located on the same video card. Thus, the consoles that + are controlled by the VGA console will be garbled. 4. fbcon=rotate:<n> @@ -140,10 +144,11 @@ C. Boot options Console rotation will only become available if Framebuffer Console Rotation support is compiled in your kernel. - NOTE: This is purely console rotation. Any other applications that - use the framebuffer will remain at their 'normal' orientation. - Actually, the underlying fb driver is totally ignorant of console - rotation. + .. note:: + This is purely console rotation. Any other applications that + use the framebuffer will remain at their 'normal' orientation. 
+ Actually, the underlying fb driver is totally ignorant of console + rotation. 5. fbcon=margin:<color> @@ -172,7 +177,8 @@ C. Boot options The value 'n' overrides the number of bootup logos. 0 disables the logo, and -1 gives the default which is the number of online CPUs. -C. Attaching, Detaching and Unloading +D. Attaching, Detaching and Unloading +===================================== Before going on to how to attach, detach and unload the framebuffer console, an illustration of the dependencies may help. @@ -249,11 +255,11 @@ restored properly. The following is one of the several methods that you can do: echo 1 > /sys/class/vtconsole/vtcon1/bind 8. Once fbcon is unbound, all drivers registered to the system will also -become unbound. This means that fbcon and individual framebuffer drivers -can be unloaded or reloaded at will. Reloading the drivers or fbcon will -automatically bind the console, fbcon and the drivers together. Unloading -all the drivers without unloading fbcon will make it impossible for the -console to bind fbcon. + become unbound. This means that fbcon and individual framebuffer drivers + can be unloaded or reloaded at will. Reloading the drivers or fbcon will + automatically bind the console, fbcon and the drivers together. Unloading + all the drivers without unloading fbcon will make it impossible for the + console to bind fbcon. Notes for vesafb users: ======================= diff --git a/Documentation/features/core/eBPF-JIT/arch-support.txt b/Documentation/features/core/eBPF-JIT/arch-support.txt index 7434b43c2ff8..83f77f55fc87 100644 --- a/Documentation/features/core/eBPF-JIT/arch-support.txt +++ b/Documentation/features/core/eBPF-JIT/arch-support.txt @@ -7,7 +7,7 @@ | arch |status| ----------------------- | alpha: | TODO | - | arc: | TODO | + | arc: | ok | | arm: | ok | | arm64: | ok | | csky: | TODO | @@ -18,7 +18,7 @@ | mips: | ok | | nios2: | TODO | | openrisc: | TODO | - | parisc: | TODO | + | parisc: | ok | | powerpc: | ok | | riscv: | ok | | s390: | ok | diff --git a/Documentation/features/core/mseal_sys_mappings/arch-support.txt b/Documentation/features/core/mseal_sys_mappings/arch-support.txt index a3c24233eb9b..fa85381acc43 100644 --- a/Documentation/features/core/mseal_sys_mappings/arch-support.txt +++ b/Documentation/features/core/mseal_sys_mappings/arch-support.txt @@ -20,7 +20,7 @@ | openrisc: | N/A | | parisc: | TODO | | powerpc: | TODO | - | riscv: | TODO | + | riscv: | ok | | s390: | ok | | sh: | N/A | | sparc: | TODO | diff --git a/Documentation/features/core/thread-info-in-task/arch-support.txt b/Documentation/features/core/thread-info-in-task/arch-support.txt index 2afeb6bf6e64..f3d744c76061 100644 --- a/Documentation/features/core/thread-info-in-task/arch-support.txt +++ b/Documentation/features/core/thread-info-in-task/arch-support.txt @@ -24,7 +24,7 @@ | s390: | ok | | sh: | TODO | | sparc: | TODO | - | um: | TODO | + | um: | ok | | x86: | ok | | xtensa: | TODO | ----------------------- diff --git a/Documentation/features/core/tracehook/arch-support.txt b/Documentation/features/core/tracehook/arch-support.txt index a72330e25542..4f36fcbfb6d5 100644 --- a/Documentation/features/core/tracehook/arch-support.txt +++ b/Documentation/features/core/tracehook/arch-support.txt @@ -24,7 +24,7 @@ | s390: | ok | | sh: | ok | | sparc: | ok | - | um: | TODO | + | um: | ok | | x86: | ok | | xtensa: | ok | ----------------------- diff --git a/Documentation/features/perf/kprobes-event/arch-support.txt 
b/Documentation/features/perf/kprobes-event/arch-support.txt index 713a69fcd697..75c05d348c01 100644 --- a/Documentation/features/perf/kprobes-event/arch-support.txt +++ b/Documentation/features/perf/kprobes-event/arch-support.txt @@ -17,7 +17,7 @@ | microblaze: | TODO | | mips: | ok | | nios2: | TODO | - | openrisc: | TODO | + | openrisc: | ok | | parisc: | ok | | powerpc: | ok | | riscv: | ok | diff --git a/Documentation/features/time/clockevents/arch-support.txt b/Documentation/features/time/clockevents/arch-support.txt index 4d4bfac52970..d6100b226de5 100644 --- a/Documentation/features/time/clockevents/arch-support.txt +++ b/Documentation/features/time/clockevents/arch-support.txt @@ -18,7 +18,7 @@ | mips: | ok | | nios2: | ok | | openrisc: | ok | - | parisc: | TODO | + | parisc: | ok | | powerpc: | ok | | riscv: | ok | | s390: | ok | diff --git a/Documentation/filesystems/erofs.rst b/Documentation/filesystems/erofs.rst index 7ddb235aee9d..08194f194b94 100644 --- a/Documentation/filesystems/erofs.rst +++ b/Documentation/filesystems/erofs.rst @@ -116,7 +116,7 @@ cache_strategy=%s Select a strategy for cached decompression from now on: cluster for further reading. It still does in-place I/O decompression for the rest compressed physical clusters; - readaround Cache the both ends of incomplete compressed + readaround Cache both ends of incomplete compressed physical clusters for further reading. It still does in-place I/O decompression for the rest compressed physical clusters. diff --git a/Documentation/filesystems/ext4/atomic_writes.rst b/Documentation/filesystems/ext4/atomic_writes.rst index aeb47ace738d..ae8995740aa8 100644 --- a/Documentation/filesystems/ext4/atomic_writes.rst +++ b/Documentation/filesystems/ext4/atomic_writes.rst @@ -14,7 +14,7 @@ I/O) on regular files with extents, provided the underlying storage device supports hardware atomic writes. This is supported in the following two ways: 1. **Single-fsblock Atomic Writes**: - EXT4's supports atomic write operations with a single filesystem block since + EXT4 supports atomic write operations with a single filesystem block since v6.13. In this the atomic write unit minimum and maximum sizes are both set to filesystem blocksize. e.g. doing atomic write of 16KB with 16KB filesystem blocksize on 64KB @@ -50,7 +50,7 @@ Multi-fsblock Implementation Details The bigalloc feature changes ext4 to allocate in units of multiple filesystem blocks, also known as clusters. With bigalloc each bit within block bitmap -represents cluster (power of 2 number of blocks) rather than individual +represents a cluster (power of 2 number of blocks) rather than individual filesystem blocks. EXT4 supports multi-fsblock atomic writes with bigalloc, subject to the following constraints. The minimum atomic write size is the larger of the fs @@ -189,7 +189,7 @@ The write must be aligned to the filesystem's block size and not exceed the filesystem's maximum atomic write unit size. See ``generic_atomic_write_valid()`` for more details. -``statx()`` system call with ``STATX_WRITE_ATOMIC`` flag can provides following +``statx()`` system call with ``STATX_WRITE_ATOMIC`` flag can provide following details: * ``stx_atomic_write_unit_min``: Minimum size of an atomic write request. 
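
As a side note to the ``statx()`` paragraph in the ext4 atomic-writes hunk above, the advertised limits can be queried from userspace roughly as follows. A minimal sketch, assuming a kernel and libc recent enough to expose ``STATX_WRITE_ATOMIC`` and the ``stx_atomic_*`` fields; error handling is trimmed::

   #define _GNU_SOURCE
   #include <fcntl.h>
   #include <stdio.h>
   #include <sys/stat.h>

   int main(int argc, char **argv)
   {
           struct statx stx;

           if (argc < 2)
                   return 1;
           if (statx(AT_FDCWD, argv[1], 0, STATX_WRITE_ATOMIC, &stx) != 0) {
                   perror("statx");
                   return 1;
           }
           /* only meaningful if the fs/device pair supports atomic writes */
           if (stx.stx_attributes & STATX_ATTR_WRITE_ATOMIC)
                   printf("atomic write unit min/max: %u/%u bytes\n",
                          stx.stx_atomic_write_unit_min,
                          stx.stx_atomic_write_unit_max);
           return 0;
   }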
diff --git a/Documentation/filesystems/gfs2-glocks.rst b/Documentation/filesystems/gfs2-glocks.rst index adc0d4c4d979..ce5ff08cbd59 100644 --- a/Documentation/filesystems/gfs2-glocks.rst +++ b/Documentation/filesystems/gfs2-glocks.rst @@ -105,7 +105,7 @@ go_unlocked Yes No Operations must not drop either the bit lock or the spinlock if its held on entry. go_dump and do_demote_ok must never block. Note that go_dump will only be called if the glock's state - indicates that it is caching uptodate data. + indicates that it is caching up-to-date data. Glock locking order within GFS2: diff --git a/Documentation/filesystems/hpfs.rst b/Documentation/filesystems/hpfs.rst index 7e0dd2f4373e..0f9516b5eb07 100644 --- a/Documentation/filesystems/hpfs.rst +++ b/Documentation/filesystems/hpfs.rst @@ -65,7 +65,7 @@ are case sensitive, so for example when you create a file FOO, you can use 'cat FOO', 'cat Foo', 'cat foo' or 'cat F*' but not 'cat f*'. Note, that you also won't be able to compile linux kernel (and maybe other things) on HPFS because kernel creates different files with names like bootsect.S and -bootsect.s. When searching for file thats name has characters >= 128, codepages +bootsect.s. When searching for file whose name has characters >= 128, codepages are used - see below. OS/2 ignores dots and spaces at the end of file name, so this driver does as well. If you create 'a. ...', the file 'a' will be created, but you can still diff --git a/Documentation/filesystems/iomap/operations.rst b/Documentation/filesystems/iomap/operations.rst index 067ed8e14ef3..387fd9cc72ca 100644 --- a/Documentation/filesystems/iomap/operations.rst +++ b/Documentation/filesystems/iomap/operations.rst @@ -321,7 +321,7 @@ The fields are as follows: - ``writeback_submit``: Submit the previous built writeback context. Block based file systems should use the iomap_ioend_writeback_submit helper, other file system can implement their own. - File systems can optionall to hook into writeback bio submission. + File systems can optionally hook into writeback bio submission. This might include pre-write space accounting updates, or installing a custom ``->bi_end_io`` function for internal purposes, such as deferring the ioend completion to a workqueue to run metadata update diff --git a/Documentation/filesystems/ocfs2-online-filecheck.rst b/Documentation/filesystems/ocfs2-online-filecheck.rst index 2257bb53edc1..9e8449416e0b 100644 --- a/Documentation/filesystems/ocfs2-online-filecheck.rst +++ b/Documentation/filesystems/ocfs2-online-filecheck.rst @@ -58,33 +58,33 @@ inode, fixing inode and setting the size of result record history. # echo "<inode>" > /sys/fs/ocfs2/<devname>/filecheck/check # cat /sys/fs/ocfs2/<devname>/filecheck/check -The output is like this:: + The output is like this:: INO DONE ERROR 39502 1 GENERATION - <INO> lists the inode numbers. - <DONE> indicates whether the operation has been finished. - <ERROR> says what kind of errors was found. For the detailed error numbers, - please refer to the file linux/fs/ocfs2/filecheck.h. + <INO> lists the inode numbers. + <DONE> indicates whether the operation has been finished. + <ERROR> says what kind of errors was found. For the detailed error numbers, + please refer to the file linux/fs/ocfs2/filecheck.h. 2. 
If you determine to fix this inode, do::

      # echo "<inode>" > /sys/fs/ocfs2/<devname>/filecheck/fix
      # cat /sys/fs/ocfs2/<devname>/filecheck/fix

-The output is like this:::
+   The output is like this::

	INO		DONE	ERROR
	39502		1	SUCCESS

-This time, the <ERROR> column indicates whether this fix is successful or not.
+   This time, the <ERROR> column indicates whether this fix is successful or not.

3. The record cache is used to store the history of check/fix results. It's
-default size is 10, and can be adjust between the range of 10 ~ 100. You can
-adjust the size like this::
+   default size is 10, and can be adjusted within the range of 10 ~ 100. You can
+   adjust the size like this::

-	# echo "<size>" > /sys/fs/ocfs2/<devname>/filecheck/set
+      # echo "<size>" > /sys/fs/ocfs2/<devname>/filecheck/set

 Fixing stuff
 ============
diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index 42f2fb9e3c8f..3002258c9c7f 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -61,19 +61,6 @@ Preface
 0.1 Introduction/Credits
 ------------------------

-This documentation is part of a soon (or so we hope) to be released book on
-the SuSE Linux distribution. As there is no complete documentation for the
-/proc file system and we've used many freely available sources to write these
-chapters, it seems only fair to give the work back to the Linux community.
-This work is based on the 2.2.* kernel version and the upcoming 2.4.*. I'm
-afraid it's still far from complete, but we hope it will be useful. As far as
-we know, it is the first 'all-in-one' document about the /proc file system. It
-is focused on the Intel x86 hardware, so if you are looking for PPC, ARM,
-SPARC, AXP, etc., features, you probably won't find what you are looking for.
-It also only covers IPv4 networking, not IPv6 nor other protocols - sorry. But
-additions and patches are welcome and will be added to this document if you
-mail them to Bodo.
-
 We'd like to thank Alan Cox, Rik van Riel, and Alexey Kuznetsov and a lot of
 other people for help compiling this documentation. We'd also like to extend a
 special thank you to Andi Kleen for documentation, which we relied on heavily
@@ -81,17 +68,9 @@ to create this document, as well as the additional information he provided.
 Thanks to everybody else who contributed source or docs to the Linux kernel
 and helped create a great piece of software... :)

-If you have any comments, corrections or additions, please don't hesitate to
-contact Bodo Bauer at bb@ricochet.net. We'll be happy to add them to this
-document.
-
 The latest version of this document is available online at
 https://www.kernel.org/doc/html/latest/filesystems/proc.html

-If the above direction does not works for you, you could try the kernel
-mailing list at linux-kernel@vger.kernel.org and/or try to reach me at
-comandante@zaralinux.com.
- 0.2 Legal Stuff --------------- diff --git a/Documentation/filesystems/propagate_umount.txt b/Documentation/filesystems/propagate_umount.txt index c90349e5b889..9a7eb96df300 100644 --- a/Documentation/filesystems/propagate_umount.txt +++ b/Documentation/filesystems/propagate_umount.txt @@ -286,7 +286,7 @@ Trim_one(m) strip the "seen by Trim_ancestors" mark from m remove m from the Candidates list return - + remove_this = false found = false for each n in children(m) @@ -312,7 +312,7 @@ Trim_ancestors(m) } Terminating condition in the loop in Trim_ancestors() is correct, -since that that loop will never run into p belonging to U - p is always +since that loop will never run into p belonging to U - p is always an ancestor of argument of Trim_one() and since U is closed, the argument of Trim_one() would also have to belong to U. But Trim_one() is never called for elements of U. In other words, p belongs to S if and only @@ -361,7 +361,7 @@ such removals. Proof: suppose S was non-shifting, x is a locked element of S, parent of x is not in S and S - {x} is not non-shifting. Then there is an element m in S - {x} and a subtree mounted strictly inside m, such that m contains -an element not in in S - {x}. Since S is non-shifting, everything in +an element not in S - {x}. Since S is non-shifting, everything in that subtree must belong to S. But that means that this subtree must contain x somewhere *and* that parent of x either belongs that subtree or is equal to m. Either way it must belong to S. Contradiction. diff --git a/Documentation/filesystems/resctrl.rst b/Documentation/filesystems/resctrl.rst index 006d23af66e1..b7f35b07876a 100644 --- a/Documentation/filesystems/resctrl.rst +++ b/Documentation/filesystems/resctrl.rst @@ -769,7 +769,7 @@ this would be dependent on number of cores the benchmark is run on. depending on # of threads: For the same SKU in #1, a 'single thread, with 10% bandwidth' and '4 -thread, with 10% bandwidth' can consume upto 10GBps and 40GBps although +thread, with 10% bandwidth' can consume up to 10GBps and 40GBps although they have same percentage bandwidth of 10%. This is simply because as threads start using more cores in an rdtgroup, the actual bandwidth may increase or vary although user specified bandwidth percentage is same. diff --git a/Documentation/filesystems/sharedsubtree.rst b/Documentation/filesystems/sharedsubtree.rst index 1cf56489ed48..8b7dc9159083 100644 --- a/Documentation/filesystems/sharedsubtree.rst +++ b/Documentation/filesystems/sharedsubtree.rst @@ -31,965 +31,960 @@ and versioned filesystem. ----------- Shared subtree provides four different flavors of mounts; struct vfsmount to be -precise +precise: - a. shared mount - b. slave mount - c. private mount - d. unbindable mount +a) A **shared mount** can be replicated to as many mountpoints and all the + replicas continue to be exactly same. -2a) A shared mount can be replicated to as many mountpoints and all the -replicas continue to be exactly same. + Here is an example: - Here is an example: + Let's say /mnt has a mount that is shared:: - Let's say /mnt has a mount that is shared:: + # mount --make-shared /mnt - mount --make-shared /mnt + .. note:: + mount(8) command now supports the --make-shared flag, + so the sample 'smount' program is no longer needed and has been + removed. - Note: mount(8) command now supports the --make-shared flag, - so the sample 'smount' program is no longer needed and has been - removed. 
+ :: - :: + # mount --bind /mnt /tmp - # mount --bind /mnt /tmp + The above command replicates the mount at /mnt to the mountpoint /tmp + and the contents of both the mounts remain identical. - The above command replicates the mount at /mnt to the mountpoint /tmp - and the contents of both the mounts remain identical. + :: - :: + #ls /mnt + a b c - #ls /mnt - a b c + #ls /tmp + a b c - #ls /tmp - a b c + Now let's say we mount a device at /tmp/a:: - Now let's say we mount a device at /tmp/a:: + # mount /dev/sd0 /tmp/a - # mount /dev/sd0 /tmp/a + # ls /tmp/a + t1 t2 t3 - #ls /tmp/a - t1 t2 t3 + # ls /mnt/a + t1 t2 t3 - #ls /mnt/a - t1 t2 t3 + Note that the mount has propagated to the mount at /mnt as well. - Note that the mount has propagated to the mount at /mnt as well. + And the same is true even when /dev/sd0 is mounted on /mnt/a. The + contents will be visible under /tmp/a too. - And the same is true even when /dev/sd0 is mounted on /mnt/a. The - contents will be visible under /tmp/a too. +b) A **slave mount** is like a shared mount except that mount and umount events + only propagate towards it. -2b) A slave mount is like a shared mount except that mount and umount events - only propagate towards it. + All slave mounts have a master mount which is a shared. - All slave mounts have a master mount which is a shared. + Here is an example: - Here is an example: + Let's say /mnt has a mount which is shared:: - Let's say /mnt has a mount which is shared. - # mount --make-shared /mnt + # mount --make-shared /mnt - Let's bind mount /mnt to /tmp - # mount --bind /mnt /tmp + Let's bind mount /mnt to /tmp:: - the new mount at /tmp becomes a shared mount and it is a replica of - the mount at /mnt. + # mount --bind /mnt /tmp - Now let's make the mount at /tmp; a slave of /mnt - # mount --make-slave /tmp + the new mount at /tmp becomes a shared mount and it is a replica of + the mount at /mnt. - let's mount /dev/sd0 on /mnt/a - # mount /dev/sd0 /mnt/a + Now let's make the mount at /tmp; a slave of /mnt:: - #ls /mnt/a - t1 t2 t3 + # mount --make-slave /tmp - #ls /tmp/a - t1 t2 t3 + let's mount /dev/sd0 on /mnt/a:: - Note the mount event has propagated to the mount at /tmp + # mount /dev/sd0 /mnt/a - However let's see what happens if we mount something on the mount at /tmp + # ls /mnt/a + t1 t2 t3 - # mount /dev/sd1 /tmp/b + # ls /tmp/a + t1 t2 t3 - #ls /tmp/b - s1 s2 s3 + Note the mount event has propagated to the mount at /tmp - #ls /mnt/b + However let's see what happens if we mount something on the mount at + /tmp:: - Note how the mount event has not propagated to the mount at - /mnt + # mount /dev/sd1 /tmp/b + # ls /tmp/b + s1 s2 s3 -2c) A private mount does not forward or receive propagation. + # ls /mnt/b - This is the mount we are familiar with. Its the default type. + Note how the mount event has not propagated to the mount at + /mnt -2d) A unbindable mount is a unbindable private mount +c) A **private mount** does not forward or receive propagation. - let's say we have a mount at /mnt and we make it unbindable:: + This is the mount we are familiar with. Its the default type. - # mount --make-unbindable /mnt - Let's try to bind mount this mount somewhere else:: +d) An **unbindable mount** is, as the name suggests, an unbindable private + mount. - # mount --bind /mnt /tmp - mount: wrong fs type, bad option, bad superblock on /mnt, - or too many mounted file systems + let's say we have a mount at /mnt and we make it unbindable:: - Binding a unbindable mount is a invalid operation. 
+      # mount --make-unbindable /mnt
+
+   Let's try to bind mount this mount somewhere else::
+
+      # mount --bind /mnt /tmp
+      mount: wrong fs type, bad option, bad superblock on /mnt,
+             or too many mounted file systems
+
+   Binding an unbindable mount is an invalid operation.

 3) Setting mount states
 -----------------------

-	The mount command (util-linux package) can be used to set mount
-	states::
+The mount command (util-linux package) can be used to set mount
+states::

-	mount --make-shared mountpoint
-	mount --make-slave mountpoint
-	mount --make-private mountpoint
-	mount --make-unbindable mountpoint
+	mount --make-shared mountpoint
+	mount --make-slave mountpoint
+	mount --make-private mountpoint
+	mount --make-unbindable mountpoint

 4) Use cases
 ------------

-	A) A process wants to clone its own namespace, but still wants to
-	   access the CD that got mounted recently.
+A) A process wants to clone its own namespace, but still wants to
+   access the CD that got mounted recently.

-	   Solution:
+   Solution:

-		The system administrator can make the mount at /cdrom shared::
+     The system administrator can make the mount at /cdrom shared::

-			mount --bind /cdrom /cdrom
-			mount --make-shared /cdrom
+       mount --bind /cdrom /cdrom
+       mount --make-shared /cdrom

-		Now any process that clones off a new namespace will have a
-		mount at /cdrom which is a replica of the same mount in the
-		parent namespace.
+     Now any process that clones off a new namespace will have a
+     mount at /cdrom which is a replica of the same mount in the
+     parent namespace.

-		So when a CD is inserted and mounted at /cdrom that mount gets
-		propagated to the other mount at /cdrom in all the other clone
-		namespaces.
+     So when a CD is inserted and mounted at /cdrom that mount gets
+     propagated to the other mount at /cdrom in all the other clone
+     namespaces.

-	B) A process wants its mounts invisible to any other process, but
-	   still be able to see the other system mounts.
+B) A process wants its mounts invisible to any other process, but
+   still be able to see the other system mounts.

-	   Solution:
+   Solution:

-		To begin with, the administrator can mark the entire mount tree
-		as shareable::
+     To begin with, the administrator can mark the entire mount tree
+     as shareable::

-			mount --make-rshared /
+       mount --make-rshared /

-		A new process can clone off a new namespace. And mark some part
-		of its namespace as slave::
+     A new process can clone off a new namespace and mark some part
+     of its namespace as slave::

-			mount --make-rslave /myprivatetree
+       mount --make-rslave /myprivatetree

-		Hence forth any mounts within the /myprivatetree done by the
-		process will not show up in any other namespace. However mounts
-		done in the parent namespace under /myprivatetree still shows
-		up in the process's namespace.
+     Henceforth any mounts within the /myprivatetree done by the
+     process will not show up in any other namespace. However mounts
+     done in the parent namespace under /myprivatetree still show
+     up in the process's namespace.

-	Apart from the above semantics this feature provides the
-	building blocks to solve the following problems:
+Apart from the above semantics this feature provides the
+building blocks to solve the following problems:

-	C) Per-user namespace
+C) Per-user namespace

-		The above semantics allows a way to share mounts across
-		namespaces. But namespaces are associated with processes.
If - namespaces are made first class objects with user API to - associate/disassociate a namespace with userid, then each user - could have his/her own namespace and tailor it to his/her - requirements. This needs to be supported in PAM. + The above semantics allows a way to share mounts across + namespaces. But namespaces are associated with processes. If + namespaces are made first class objects with user API to + associate/disassociate a namespace with userid, then each user + could have his/her own namespace and tailor it to his/her + requirements. This needs to be supported in PAM. - D) Versioned files +D) Versioned files - If the entire mount tree is visible at multiple locations, then - an underlying versioning file system can return different - versions of the file depending on the path used to access that - file. + If the entire mount tree is visible at multiple locations, then + an underlying versioning file system can return different + versions of the file depending on the path used to access that + file. - An example is:: + An example is:: - mount --make-shared / - mount --rbind / /view/v1 - mount --rbind / /view/v2 - mount --rbind / /view/v3 - mount --rbind / /view/v4 + mount --make-shared / + mount --rbind / /view/v1 + mount --rbind / /view/v2 + mount --rbind / /view/v3 + mount --rbind / /view/v4 - and if /usr has a versioning filesystem mounted, then that - mount appears at /view/v1/usr, /view/v2/usr, /view/v3/usr and - /view/v4/usr too + and if /usr has a versioning filesystem mounted, then that + mount appears at /view/v1/usr, /view/v2/usr, /view/v3/usr and + /view/v4/usr too - A user can request v3 version of the file /usr/fs/namespace.c - by accessing /view/v3/usr/fs/namespace.c . The underlying - versioning filesystem can then decipher that v3 version of the - filesystem is being requested and return the corresponding - inode. + A user can request v3 version of the file /usr/fs/namespace.c + by accessing /view/v3/usr/fs/namespace.c . The underlying + versioning filesystem can then decipher that v3 version of the + filesystem is being requested and return the corresponding + inode. 5) Detailed semantics --------------------- - The section below explains the detailed semantics of - bind, rbind, move, mount, umount and clone-namespace operations. - - Note: the word 'vfsmount' and the noun 'mount' have been used - to mean the same thing, throughout this document. +The section below explains the detailed semantics of +bind, rbind, move, mount, umount and clone-namespace operations. -5a) Mount states +.. Note:: + the word 'vfsmount' and the noun 'mount' have been used + to mean the same thing, throughout this document. - A given mount can be in one of the following states +a) Mount states - 1) shared - 2) slave - 3) shared and slave - 4) private - 5) unbindable + A **propagation event** is defined as event generated on a vfsmount + that leads to mount or unmount actions in other vfsmounts. - A 'propagation event' is defined as event generated on a vfsmount - that leads to mount or unmount actions in other vfsmounts. + A **peer group** is defined as a group of vfsmounts that propagate + events to each other. - A 'peer group' is defined as a group of vfsmounts that propagate - events to each other. + A given mount can be in one of the following states: - (1) Shared mounts + (1) Shared mounts - A 'shared mount' is defined as a vfsmount that belongs to a - 'peer group'. + A **shared mount** is defined as a vfsmount that belongs to a + peer group. 
- For example:: + For example:: - mount --make-shared /mnt - mount --bind /mnt /tmp + mount --make-shared /mnt + mount --bind /mnt /tmp - The mount at /mnt and that at /tmp are both shared and belong - to the same peer group. Anything mounted or unmounted under - /mnt or /tmp reflect in all the other mounts of its peer - group. + The mount at /mnt and that at /tmp are both shared and belong + to the same peer group. Anything mounted or unmounted under + /mnt or /tmp reflect in all the other mounts of its peer + group. - (2) Slave mounts + (2) Slave mounts - A 'slave mount' is defined as a vfsmount that receives - propagation events and does not forward propagation events. + A **slave mount** is defined as a vfsmount that receives + propagation events and does not forward propagation events. - A slave mount as the name implies has a master mount from which - mount/unmount events are received. Events do not propagate from - the slave mount to the master. Only a shared mount can be made - a slave by executing the following command:: + A slave mount as the name implies has a master mount from which + mount/unmount events are received. Events do not propagate from + the slave mount to the master. Only a shared mount can be made + a slave by executing the following command:: - mount --make-slave mount + mount --make-slave mount - A shared mount that is made as a slave is no more shared unless - modified to become shared. + A shared mount that is made as a slave is no more shared unless + modified to become shared. - (3) Shared and Slave + (3) Shared and Slave - A vfsmount can be both shared as well as slave. This state - indicates that the mount is a slave of some vfsmount, and - has its own peer group too. This vfsmount receives propagation - events from its master vfsmount, and also forwards propagation - events to its 'peer group' and to its slave vfsmounts. + A vfsmount can be both **shared** as well as **slave**. This state + indicates that the mount is a slave of some vfsmount, and + has its own peer group too. This vfsmount receives propagation + events from its master vfsmount, and also forwards propagation + events to its 'peer group' and to its slave vfsmounts. - Strictly speaking, the vfsmount is shared having its own - peer group, and this peer-group is a slave of some other - peer group. + Strictly speaking, the vfsmount is shared having its own + peer group, and this peer-group is a slave of some other + peer group. - Only a slave vfsmount can be made as 'shared and slave' by - either executing the following command:: + Only a slave vfsmount can be made as 'shared and slave' by + either executing the following command:: - mount --make-shared mount + mount --make-shared mount - or by moving the slave vfsmount under a shared vfsmount. + or by moving the slave vfsmount under a shared vfsmount. - (4) Private mount + (4) Private mount - A 'private mount' is defined as vfsmount that does not - receive or forward any propagation events. + A **private mount** is defined as vfsmount that does not + receive or forward any propagation events. - (5) Unbindable mount + (5) Unbindable mount - A 'unbindable mount' is defined as vfsmount that does not - receive or forward any propagation events and cannot - be bind mounted. + A **unbindable mount** is defined as vfsmount that does not + receive or forward any propagation events and cannot + be bind mounted. 
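
An aside on the mount states enumerated above: each of them corresponds to a mount(2) propagation-type flag, which is what the ``mount --make-*`` commands issue under the hood. A minimal userspace sketch (requires CAP_SYS_ADMIN; the path is an example)::

   #define _GNU_SOURCE
   #include <stdio.h>
   #include <sys/mount.h>

   int main(void)
   {
           /* equivalent of "mount --make-shared /mnt"; the source,
              fstype and data arguments are ignored when changing
              propagation */
           if (mount(NULL, "/mnt", NULL, MS_SHARED, NULL) != 0) {
                   perror("MS_SHARED");
                   return 1;
           }
           /* MS_SLAVE, MS_PRIVATE and MS_UNBINDABLE select the other
              states; or-ing in MS_REC gives the recursive --make-r*
              forms */
           if (mount(NULL, "/mnt", NULL, MS_REC | MS_SLAVE, NULL) != 0) {
                   perror("MS_SLAVE");
                   return 1;
           }
           return 0;
   }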
- State diagram:
+ State diagram:

- The state diagram below explains the state transition of a mount,
- in response to various commands::
+ The state diagram below explains the state transition of a mount,
+ in response to various commands::

-   -----------------------------------------------------------------------
-   |             |make-shared |  make-slave  | make-private |make-unbindab|
-   --------------|------------|--------------|--------------|-------------|
-   |shared       |shared      |*slave/private|    private   |  unbindable |
-   |             |            |              |              |             |
-   |-------------|------------|--------------|--------------|-------------|
-   |slave        |shared      |  **slave     |    private   |  unbindable |
-   |             |and slave   |              |              |             |
-   |-------------|------------|--------------|--------------|-------------|
-   |shared       |shared      |    slave     |    private   |  unbindable |
-   |and slave    |and slave   |              |              |             |
-   |-------------|------------|--------------|--------------|-------------|
-   |private      |shared      |  **private   |    private   |  unbindable |
-   |-------------|------------|--------------|--------------|-------------|
-   |unbindable   |shared      |**unbindable  |    private   |  unbindable |
-   ------------------------------------------------------------------------
+   -----------------------------------------------------------------------
+   |             |make-shared |  make-slave  | make-private |make-unbindab|
+   --------------|------------|--------------|--------------|-------------|
+   |shared       |shared      |*slave/private|    private   |  unbindable |
+   |             |            |              |              |             |
+   |-------------|------------|--------------|--------------|-------------|
+   |slave        |shared      |  **slave     |    private   |  unbindable |
+   |             |and slave   |              |              |             |
+   |-------------|------------|--------------|--------------|-------------|
+   |shared       |shared      |    slave     |    private   |  unbindable |
+   |and slave    |and slave   |              |              |             |
+   |-------------|------------|--------------|--------------|-------------|
+   |private      |shared      |  **private   |    private   |  unbindable |
+   |-------------|------------|--------------|--------------|-------------|
+   |unbindable   |shared      |**unbindable  |    private   |  unbindable |
+   ------------------------------------------------------------------------

- * if the shared mount is the only mount in its peer group, making it
- slave, makes it private automatically. Note that there is no master to
- which it can be slaved to.
+ * if the shared mount is the only mount in its peer group, making it
+ a slave makes it private automatically. Note that there is no master
+ to which it could then be slaved.

- ** slaving a non-shared mount has no effect on the mount.
+ ** slaving a non-shared mount has no effect on the mount.

- Apart from the commands listed below, the 'move' operation also changes
- the state of a mount depending on type of the destination mount. Its
- explained in section 5d.
+ Apart from the commands listed above, the 'move' operation also changes
+ the state of a mount depending on the type of the destination mount.
+ It's explained in section 5d.

-5b) Bind semantics
+b) Bind semantics

- Consider the following command::
+ Consider the following command::

-     mount --bind A/a B/b
+     mount --bind A/a B/b

- where 'A' is the source mount, 'a' is the dentry in the mount 'A', 'B'
- is the destination mount and 'b' is the dentry in the destination mount.
+ where 'A' is the source mount, 'a' is the dentry in the mount 'A', 'B'
+ is the destination mount and 'b' is the dentry in the destination mount.

- The outcome depends on the type of mount of 'A' and 'B'. The table
- below contains quick reference::
+ The outcome depends on the type of mount of 'A' and 'B'. The table
+ below contains a quick reference::

-   --------------------------------------------------------------------------
-   |         BIND MOUNT OPERATION                                           |
-   |************************************************************************|
-   |source(A)->| shared       |     private    |      slave     | unbindable |
-   | dest(B)  |               |                |                |            |
-   |   |      |               |                |                |            |
-   |   v      |               |                |                |            |
-   |************************************************************************|
-   |  shared  | shared        |     shared     | shared & slave |  invalid   |
-   |          |               |                |                |            |
-   |non-shared| shared        |     private    |      slave     |  invalid   |
-   **************************************************************************
+   --------------------------------------------------------------------------
+   |         BIND MOUNT OPERATION                                           |
+   |************************************************************************|
+   |source(A)->| shared       |     private    |      slave     | unbindable |
+   | dest(B)  |               |                |                |            |
+   |   |      |               |                |                |            |
+   |   v      |               |                |                |            |
+   |************************************************************************|
+   |  shared  | shared        |     shared     | shared & slave |  invalid   |
+   |          |               |                |                |            |
+   |non-shared| shared        |     private    |      slave     |  invalid   |
+   **************************************************************************

- Details:
+ Details:

- 1. 'A' is a shared mount and 'B' is a shared mount. A new mount 'C'
- which is clone of 'A', is created. Its root dentry is 'a' . 'C' is
- mounted on mount 'B' at dentry 'b'. Also new mount 'C1', 'C2', 'C3' ...
- are created and mounted at the dentry 'b' on all mounts where 'B'
- propagates to. A new propagation tree containing 'C1',..,'Cn' is
- created. This propagation tree is identical to the propagation tree of
- 'B'. And finally the peer-group of 'C' is merged with the peer group
- of 'A'.
+ 1. 'A' is a shared mount and 'B' is a shared mount. A new mount 'C',
+ which is a clone of 'A', is created. Its root dentry is 'a'. 'C' is
+ mounted on mount 'B' at dentry 'b'. Also new mounts 'C1', 'C2', 'C3' ...
+ are created and mounted at the dentry 'b' on all mounts where 'B'
+ propagates to. A new propagation tree containing 'C1',..,'Cn' is
+ created. This propagation tree is identical to the propagation tree of
+ 'B'. And finally the peer group of 'C' is merged with the peer group
+ of 'A'.

- 2. 'A' is a private mount and 'B' is a shared mount. A new mount 'C'
- which is clone of 'A', is created. Its root dentry is 'a'. 'C' is
- mounted on mount 'B' at dentry 'b'. Also new mount 'C1', 'C2', 'C3' ...
- are created and mounted at the dentry 'b' on all mounts where 'B'
- propagates to. A new propagation tree is set containing all new mounts
- 'C', 'C1', .., 'Cn' with exactly the same configuration as the
- propagation tree for 'B'.
+ 2. 'A' is a private mount and 'B' is a shared mount. A new mount 'C',
+ which is a clone of 'A', is created. Its root dentry is 'a'. 'C' is
+ mounted on mount 'B' at dentry 'b'. Also new mounts 'C1', 'C2', 'C3' ...
+ are created and mounted at the dentry 'b' on all mounts where 'B'
+ propagates to. A new propagation tree is set up containing all the new
+ mounts 'C', 'C1', .., 'Cn' with exactly the same configuration as the
+ propagation tree for 'B'.

- 3. 'A' is a slave mount of mount 'Z' and 'B' is a shared mount. A new
- mount 'C' which is clone of 'A', is created. Its root dentry is 'a' .
- 'C' is mounted on mount 'B' at dentry 'b'. Also new mounts 'C1', 'C2',
- 'C3' ... are created and mounted at the dentry 'b' on all mounts where
- 'B' propagates to. A new propagation tree containing the new mounts
- 'C','C1',.. 'Cn' is created. This propagation tree is identical to the
- propagation tree for 'B'. And finally the mount 'C' and its peer group
- is made the slave of mount 'Z'. In other words, mount 'C' is in the
- state 'slave and shared'.
- 4. 'A' is a unbindable mount and 'B' is a shared mount. This is a
- invalid operation.

- 5. 'A' is a private mount and 'B' is a non-shared(private or slave or
- unbindable) mount. A new mount 'C' which is clone of 'A', is created.
- Its root dentry is 'a'. 'C' is mounted on mount 'B' at dentry 'b'.

- 6. 'A' is a shared mount and 'B' is a non-shared mount. A new mount 'C'
- which is a clone of 'A' is created. Its root dentry is 'a'. 'C' is
- mounted on mount 'B' at dentry 'b'. 'C' is made a member of the
- peer-group of 'A'.

- 7. 'A' is a slave mount of mount 'Z' and 'B' is a non-shared mount. A
- new mount 'C' which is a clone of 'A' is created. Its root dentry is
- 'a'. 'C' is mounted on mount 'B' at dentry 'b'. Also 'C' is set as a
- slave mount of 'Z'. In other words 'A' and 'C' are both slave mounts of
- 'Z'. All mount/unmount events on 'Z' propagates to 'A' and 'C'. But
- mount/unmount on 'A' do not propagate anywhere else. Similarly
- mount/unmount on 'C' do not propagate anywhere else.

- 8. 'A' is a unbindable mount and 'B' is a non-shared mount. This is a
- invalid operation. A unbindable mount cannot be bind mounted.

-5c) Rbind semantics

- rbind is same as bind. Bind replicates the specified mount. Rbind
- replicates all the mounts in the tree belonging to the specified mount.
- Rbind mount is bind mount applied to all the mounts in the tree.

- If the source tree that is rbind has some unbindable mounts,
- then the subtree under the unbindable mount is pruned in the new
- location.

- eg:

- let's say we have the following mount tree::

-          A
-         / \
-        B   C
-       / \ / \
-      D  E F  G

- Let's say all the mount except the mount C in the tree are
- of a type other than unbindable.

- If this tree is rbound to say Z

- We will have the following tree at the new location::

-          Z
-          |
-          A'
-         /
-        B'       Note how the tree under C is pruned
-       / \       in the new location.
-      D'  E'

-5d) Move semantics

- Consider the following command

-     mount --move A B/b

+ 3. 'A' is a slave mount of mount 'Z' and 'B' is a shared mount. A new
+ mount 'C', which is a clone of 'A', is created. Its root dentry is 'a'.
+ 'C' is mounted on mount 'B' at dentry 'b'. Also new mounts 'C1', 'C2',
+ 'C3' ... are created and mounted at the dentry 'b' on all mounts where
+ 'B' propagates to. A new propagation tree containing the new mounts
+ 'C','C1',.. 'Cn' is created. This propagation tree is identical to the
+ propagation tree for 'B'. And finally the mount 'C' and its peer group
+ is made the slave of mount 'Z'. In other words, mount 'C' is in the
+ state 'slave and shared'.

+ 4. 'A' is an unbindable mount and 'B' is a shared mount. This is an
+ invalid operation.

+ 5. 'A' is a private mount and 'B' is a non-shared (private or slave or
+ unbindable) mount. A new mount 'C', which is a clone of 'A', is created.
+ Its root dentry is 'a'. 'C' is mounted on mount 'B' at dentry 'b'.

+ 6. 'A' is a shared mount and 'B' is a non-shared mount. A new mount 'C',
+ which is a clone of 'A', is created. Its root dentry is 'a'. 'C' is
+ mounted on mount 'B' at dentry 'b'. 'C' is made a member of the
+ peer group of 'A'.

+ 7. 'A' is a slave mount of mount 'Z' and 'B' is a non-shared mount. A
+ new mount 'C', which is a clone of 'A', is created. Its root dentry is
+ 'a'. 'C' is mounted on mount 'B' at dentry 'b'. Also 'C' is set as a
+ slave mount of 'Z'. In other words, 'A' and 'C' are both slave mounts
+ of 'Z'. All mount/unmount events on 'Z' propagate to 'A' and 'C', but
+ mount/unmount events on 'A' do not propagate anywhere else. Similarly,
+ mount/unmount events on 'C' do not propagate anywhere else.

+ 8. 'A' is an unbindable mount and 'B' is a non-shared mount. This is an
+ invalid operation. An unbindable mount cannot be bind mounted.

+c) Rbind semantics

+ rbind is the same as bind, except that bind replicates only the
+ specified mount while rbind replicates all the mounts in the tree
+ belonging to the specified mount. In other words, rbind is the bind
+ operation applied to every mount in the tree.

+ If the source tree that is rbound has some unbindable mounts,
+ then the subtree under the unbindable mount is pruned in the new
+ location.

+ For example, let's say we have the following mount tree::

+          A
+         / \
+        B   C
+       / \ / \
+      D  E F  G

+ Let's say all the mounts in the tree except mount C are
+ of a type other than unbindable.

+ If this tree is rbound to, say, Z, we will have the following tree at
+ the new location::

+          Z
+          |
+          A'
+         /
+        B'       Note how the tree under C is pruned
+       / \       in the new location.
+      D'  E'

+d) Move semantics

+ Consider the following command::

+     mount --move A B/b

- where 'A' is the source mount, 'B' is the destination mount and 'b' is
- the dentry in the destination mount.
+ where 'A' is the source mount, 'B' is the destination mount and 'b' is
+ the dentry in the destination mount.

- The outcome depends on the type of the mount of 'A' and 'B'. The table
- below is a quick reference::
+ The outcome depends on the type of the mounts 'A' and 'B'. The table
+ below is a quick reference::

-   ---------------------------------------------------------------------------
-   |         MOVE MOUNT OPERATION                                             |
-   |**************************************************************************|
-   |source(A)->| shared       |     private    |      slave     | unbindable  |
-   | dest(B)  |               |                |                |             |
-   |   |      |               |                |                |             |
-   |   v      |               |                |                |             |
-   |**************************************************************************|
-   |  shared  | shared        |     shared     |shared and slave|   invalid   |
-   |          |               |                |                |             |
-   |non-shared| shared        |     private    |      slave     | unbindable  |
-   ***************************************************************************
+   ---------------------------------------------------------------------------
+   |         MOVE MOUNT OPERATION                                             |
+   |**************************************************************************|
+   |source(A)->| shared       |     private    |      slave     | unbindable  |
+   | dest(B)  |               |                |                |             |
+   |   |      |               |                |                |             |
+   |   v      |               |                |                |             |
+   |**************************************************************************|
+   |  shared  | shared        |     shared     |shared and slave|   invalid   |
+   |          |               |                |                |             |
+   |non-shared| shared        |     private    |      slave     | unbindable  |
+   ***************************************************************************

- .. Note:: moving a mount residing under a shared mount is invalid.
+ .. Note:: moving a mount residing under a shared mount is invalid.

- Details follow:
+ Details follow:

- 1. 'A' is a shared mount and 'B' is a shared mount. The mount 'A' is
- mounted on mount 'B' at dentry 'b'. Also new mounts 'A1', 'A2'...'An'
- are created and mounted at dentry 'b' on all mounts that receive
- propagation from mount 'B'. A new propagation tree is created in the
- exact same configuration as that of 'B'. This new propagation tree
- contains all the new mounts 'A1', 'A2'... 'An'. And this new
- propagation tree is appended to the already existing propagation tree
- of 'A'.
+ 1. 'A' is a shared mount and 'B' is a shared mount. The mount 'A' is
+ mounted on mount 'B' at dentry 'b'. Also new mounts 'A1', 'A2'...'An'
+ are created and mounted at dentry 'b' on all mounts that receive
+ propagation from mount 'B'. A new propagation tree is created in the
+ exact same configuration as that of 'B'. This new propagation tree
+ contains all the new mounts 'A1', 'A2'... 'An'. And this new
+ propagation tree is appended to the already existing propagation tree
+ of 'A'.

- 2. 'A' is a private mount and 'B' is a shared mount. The mount 'A' is
- mounted on mount 'B' at dentry 'b'. Also new mount 'A1', 'A2'... 'An'
- are created and mounted at dentry 'b' on all mounts that receive
- propagation from mount 'B'. The mount 'A' becomes a shared mount and a
- propagation tree is created which is identical to that of
- 'B'. This new propagation tree contains all the new mounts 'A1',
- 'A2'... 'An'.
+ 2. 'A' is a private mount and 'B' is a shared mount. The mount 'A' is
+ mounted on mount 'B' at dentry 'b'. Also new mounts 'A1', 'A2'... 'An'
+ are created and mounted at dentry 'b' on all mounts that receive
+ propagation from mount 'B'. The mount 'A' becomes a shared mount and a
+ propagation tree is created which is identical to that of
+ 'B'. This new propagation tree contains all the new mounts 'A1',
+ 'A2'... 'An'.

- 3. 'A' is a slave mount of mount 'Z' and 'B' is a shared mount. The
- mount 'A' is mounted on mount 'B' at dentry 'b'. Also new mounts 'A1',
- 'A2'... 'An' are created and mounted at dentry 'b' on all mounts that
- receive propagation from mount 'B'. A new propagation tree is created
- in the exact same configuration as that of 'B'. This new propagation
- tree contains all the new mounts 'A1', 'A2'... 'An'. And this new
- propagation tree is appended to the already existing propagation tree of
- 'A'. Mount 'A' continues to be the slave mount of 'Z' but it also
- becomes 'shared'.
+ 3. 'A' is a slave mount of mount 'Z' and 'B' is a shared mount. The
+ mount 'A' is mounted on mount 'B' at dentry 'b'. Also new mounts 'A1',
+ 'A2'... 'An' are created and mounted at dentry 'b' on all mounts that
+ receive propagation from mount 'B'. A new propagation tree is created
+ in the exact same configuration as that of 'B'. This new propagation
+ tree contains all the new mounts 'A1', 'A2'... 'An'. And this new
+ propagation tree is appended to the already existing propagation tree
+ of 'A'. Mount 'A' continues to be a slave mount of 'Z', but it also
+ becomes 'shared'.

- 4. 'A' is a unbindable mount and 'B' is a shared mount. The operation
- is invalid. Because mounting anything on the shared mount 'B' can
- create new mounts that get mounted on the mounts that receive
- propagation from 'B'. And since the mount 'A' is unbindable, cloning
- it to mount at other mountpoints is not possible.
+ 4. 'A' is an unbindable mount and 'B' is a shared mount. The operation
+ is invalid, because mounting anything on the shared mount 'B' can
+ create new mounts that get mounted on the mounts that receive
+ propagation from 'B'. And since mount 'A' is unbindable, cloning
+ it to mount at other mountpoints is not possible.

- 5. 'A' is a private mount and 'B' is a non-shared(private or slave or
- unbindable) mount. The mount 'A' is mounted on mount 'B' at dentry 'b'.
+ 5. 'A' is a private mount and 'B' is a non-shared (private or slave or
+ unbindable) mount. The mount 'A' is mounted on mount 'B' at dentry 'b'.

- 6. 'A' is a shared mount and 'B' is a non-shared mount. The mount 'A'
- is mounted on mount 'B' at dentry 'b'. Mount 'A' continues to be a
- shared mount.
+ 6. 'A' is a shared mount and 'B' is a non-shared mount. The mount 'A'
+ is mounted on mount 'B' at dentry 'b'. Mount 'A' continues to be a
+ shared mount.

- 7. 'A' is a slave mount of mount 'Z' and 'B' is a non-shared mount.
- The mount 'A' is mounted on mount 'B' at dentry 'b'. Mount 'A'
- continues to be a slave mount of mount 'Z'.
+ 7. 'A' is a slave mount of mount 'Z' and 'B' is a non-shared mount.
+ The mount 'A' is mounted on mount 'B' at dentry 'b'. Mount 'A'
+ continues to be a slave mount of mount 'Z'.

- 8. 'A' is a unbindable mount and 'B' is a non-shared mount. The mount
- 'A' is mounted on mount 'B' at dentry 'b'. Mount 'A' continues to be a
- unbindable mount.
+ 8. 'A' is an unbindable mount and 'B' is a non-shared mount. The mount
+ 'A' is mounted on mount 'B' at dentry 'b'. Mount 'A' continues to be an
+ unbindable mount.

-5e) Mount semantics
+e) Mount semantics

- Consider the following command::
+ Consider the following command::

-     mount device B/b
+     mount device B/b

- 'B' is the destination mount and 'b' is the dentry in the destination
- mount.
+ 'B' is the destination mount and 'b' is the dentry in the destination
+ mount.

- The above operation is the same as bind operation with the exception
- that the source mount is always a private mount.
+ The above operation is the same as the bind operation, with the
+ exception that the source mount is always a private mount.

-5f) Unmount semantics
+f) Unmount semantics

- Consider the following command::
+ Consider the following command::

-     umount A
+     umount A

- where 'A' is a mount mounted on mount 'B' at dentry 'b'.
+ where 'A' is a mount mounted on mount 'B' at dentry 'b'.

- If mount 'B' is shared, then all most-recently-mounted mounts at dentry
- 'b' on mounts that receive propagation from mount 'B' and does not have
- sub-mounts within them are unmounted.
+ If mount 'B' is shared, then all most-recently-mounted mounts at dentry
+ 'b' on mounts that receive propagation from mount 'B' and do not have
+ sub-mounts within them are unmounted.

- Example: Let's say 'B1', 'B2', 'B3' are shared mounts that propagate to
- each other.
+ Example: Let's say 'B1', 'B2', 'B3' are shared mounts that propagate to
+ each other.

- let's say 'A1', 'A2', 'A3' are first mounted at dentry 'b' on mount
- 'B1', 'B2' and 'B3' respectively.
+ Let's say 'A1', 'A2', 'A3' are first mounted at dentry 'b' on mounts
+ 'B1', 'B2' and 'B3' respectively.

- let's say 'C1', 'C2', 'C3' are next mounted at the same dentry 'b' on
- mount 'B1', 'B2' and 'B3' respectively.
+ Let's say 'C1', 'C2', 'C3' are next mounted at the same dentry 'b' on
+ mounts 'B1', 'B2' and 'B3' respectively.

- if 'C1' is unmounted, all the mounts that are most-recently-mounted on
- 'B1' and on the mounts that 'B1' propagates-to are unmounted.
+ If 'C1' is unmounted, all the mounts that are most-recently-mounted on
+ 'B1' and on the mounts that 'B1' propagates to are unmounted.

- 'B1' propagates to 'B2' and 'B3'. And the most recently mounted mount
- on 'B2' at dentry 'b' is 'C2', and that of mount 'B3' is 'C3'.
+ 'B1' propagates to 'B2' and 'B3'. And the most recently mounted mount
+ on 'B2' at dentry 'b' is 'C2', and that of mount 'B3' is 'C3'.

- So all 'C1', 'C2' and 'C3' should be unmounted.
+ So all 'C1', 'C2' and 'C3' should be unmounted.

- If any of 'C2' or 'C3' has some child mounts, then that mount is not
- unmounted, but all other mounts are unmounted. However if 'C1' is told
- to be unmounted and 'C1' has some sub-mounts, the umount operation is
- failed entirely.
+ If any of 'C2' or 'C3' has some child mounts, then that mount is not
+ unmounted, but all other mounts are unmounted. However, if 'C1' is told
+ to be unmounted and 'C1' has some sub-mounts, the umount operation
+ fails entirely.

-5g) Clone Namespace
+g) Clone Namespace

- A cloned namespace contains all the mounts as that of the parent
- namespace.
+ A cloned namespace contains the same mounts as the parent
+ namespace.

- Let's say 'A' and 'B' are the corresponding mounts in the parent and the
- child namespace.
+ Let's say 'A' and 'B' are the corresponding mounts in the parent and the
+ child namespace.

- If 'A' is shared, then 'B' is also shared and 'A' and 'B' propagate to
- each other.
+ If 'A' is shared, then 'B' is also shared and 'A' and 'B' propagate to
+ each other.

- If 'A' is a slave mount of 'Z', then 'B' is also the slave mount of
- 'Z'.
+ If 'A' is a slave mount of 'Z', then 'B' is also a slave mount of
+ 'Z'.

- If 'A' is a private mount, then 'B' is a private mount too.
+ If 'A' is a private mount, then 'B' is a private mount too.

- If 'A' is unbindable mount, then 'B' is a unbindable mount too.
+ If 'A' is an unbindable mount, then 'B' is an unbindable mount too.

 6) Quiz
 -------

- A. What is the result of the following command sequence?
+A. What is the result of the following command sequence?

- ::
+ ::

-     mount --bind /mnt /mnt
-     mount --make-shared /mnt
-     mount --bind /mnt /tmp
-     mount --move /tmp /mnt/1
+     mount --bind /mnt /mnt
+     mount --make-shared /mnt
+     mount --bind /mnt /tmp
+     mount --move /tmp /mnt/1

- what should be the contents of /mnt /mnt/1 /mnt/1/1 should be?
- Should they all be identical? or should /mnt and /mnt/1 be
- identical only?
+ What should the contents of /mnt, /mnt/1 and /mnt/1/1 be?
+ Should they all be identical? Or should only /mnt and /mnt/1 be
+ identical?

- B. What is the result of the following command sequence?
+B. What is the result of the following command sequence?

- ::
+ ::

-     mount --make-rshared /
-     mkdir -p /v/1
-     mount --rbind / /v/1
+     mount --make-rshared /
+     mkdir -p /v/1
+     mount --rbind / /v/1

- what should be the content of /v/1/v/1 be?
+ What should the content of /v/1/v/1 be?

- C. What is the result of the following command sequence?
+C. What is the result of the following command sequence?

- ::
+ ::

-     mount --bind /mnt /mnt
-     mount --make-shared /mnt
-     mkdir -p /mnt/1/2/3 /mnt/1/test
-     mount --bind /mnt/1 /tmp
-     mount --make-slave /mnt
-     mount --make-shared /mnt
-     mount --bind /mnt/1/2 /tmp1
-     mount --make-slave /mnt
+     mount --bind /mnt /mnt
+     mount --make-shared /mnt
+     mkdir -p /mnt/1/2/3 /mnt/1/test
+     mount --bind /mnt/1 /tmp
+     mount --make-slave /mnt
+     mount --make-shared /mnt
+     mount --bind /mnt/1/2 /tmp1
+     mount --make-slave /mnt

- At this point we have the first mount at /tmp and
- its root dentry is 1. Let's call this mount 'A'
- And then we have a second mount at /tmp1 with root
- dentry 2. Let's call this mount 'B'
- Next we have a third mount at /mnt with root dentry
- mnt. Let's call this mount 'C'
+ At this point we have the first mount at /tmp and
+ its root dentry is 1. Let's call this mount 'A'
+ And then we have a second mount at /tmp1 with root
+ dentry 2. Let's call this mount 'B'
+ Next we have a third mount at /mnt with root dentry
+ mnt.
Let's call this mount 'C' - 'B' is the slave of 'A' and 'C' is a slave of 'B' - A -> B -> C + 'B' is the slave of 'A' and 'C' is a slave of 'B' + A -> B -> C - at this point if we execute the following command + at this point if we execute the following command:: - mount --bind /bin /tmp/test + mount --bind /bin /tmp/test - The mount is attempted on 'A' + The mount is attempted on 'A' - will the mount propagate to 'B' and 'C' ? + will the mount propagate to 'B' and 'C' ? - what would be the contents of - /mnt/1/test be? + what would be the contents of + /mnt/1/test be? 7) FAQ ------ - Q1. Why is bind mount needed? How is it different from symbolic links? - symbolic links can get stale if the destination mount gets - unmounted or moved. Bind mounts continue to exist even if the - other mount is unmounted or moved. +1. Why is bind mount needed? How is it different from symbolic links? - Q2. Why can't the shared subtree be implemented using exportfs? + symbolic links can get stale if the destination mount gets + unmounted or moved. Bind mounts continue to exist even if the + other mount is unmounted or moved. - exportfs is a heavyweight way of accomplishing part of what - shared subtree can do. I cannot imagine a way to implement the - semantics of slave mount using exportfs? +2. Why can't the shared subtree be implemented using exportfs? - Q3 Why is unbindable mount needed? + exportfs is a heavyweight way of accomplishing part of what + shared subtree can do. I cannot imagine a way to implement the + semantics of slave mount using exportfs? - Let's say we want to replicate the mount tree at multiple - locations within the same subtree. +3. Why is unbindable mount needed? - if one rbind mounts a tree within the same subtree 'n' times - the number of mounts created is an exponential function of 'n'. - Having unbindable mount can help prune the unneeded bind - mounts. Here is an example. + Let's say we want to replicate the mount tree at multiple + locations within the same subtree. - step 1: - let's say the root tree has just two directories with - one vfsmount:: + if one rbind mounts a tree within the same subtree 'n' times + the number of mounts created is an exponential function of 'n'. + Having unbindable mount can help prune the unneeded bind + mounts. Here is an example. 
- root - / \ - tmp usr + step 1: + let's say the root tree has just two directories with + one vfsmount:: - And we want to replicate the tree at multiple - mountpoints under /root/tmp + root + / \ + tmp usr - step 2: - :: + And we want to replicate the tree at multiple + mountpoints under /root/tmp + step 2: + :: - mount --make-shared /root - mkdir -p /tmp/m1 + mount --make-shared /root - mount --rbind /root /tmp/m1 + mkdir -p /tmp/m1 - the new tree now looks like this:: + mount --rbind /root /tmp/m1 - root - / \ - tmp usr - / - m1 - / \ - tmp usr - / - m1 + the new tree now looks like this:: - it has two vfsmounts + root + / \ + tmp usr + / + m1 + / \ + tmp usr + / + m1 - step 3: - :: + it has two vfsmounts - mkdir -p /tmp/m2 - mount --rbind /root /tmp/m2 + step 3: + :: - the new tree now looks like this:: + mkdir -p /tmp/m2 + mount --rbind /root /tmp/m2 - root - / \ - tmp usr - / \ - m1 m2 - / \ / \ - tmp usr tmp usr - / \ / - m1 m2 m1 - / \ / \ - tmp usr tmp usr - / / \ - m1 m1 m2 - / \ - tmp usr - / \ - m1 m2 + the new tree now looks like this:: - it has 6 vfsmounts + root + / \ + tmp usr + / \ + m1 m2 + / \ / \ + tmp usr tmp usr + / \ / + m1 m2 m1 + / \ / \ + tmp usr tmp usr + / / \ + m1 m1 m2 + / \ + tmp usr + / \ + m1 m2 - step 4: - :: - mkdir -p /tmp/m3 - mount --rbind /root /tmp/m3 + it has 6 vfsmounts - I won't draw the tree..but it has 24 vfsmounts + step 4: + :: + mkdir -p /tmp/m3 + mount --rbind /root /tmp/m3 - at step i the number of vfsmounts is V[i] = i*V[i-1]. - This is an exponential function. And this tree has way more - mounts than what we really needed in the first place. + I won't draw the tree..but it has 24 vfsmounts - One could use a series of umount at each step to prune - out the unneeded mounts. But there is a better solution. - Unclonable mounts come in handy here. - step 1: - let's say the root tree has just two directories with - one vfsmount:: + at step i the number of vfsmounts is V[i] = i*V[i-1]. + This is an exponential function. And this tree has way more + mounts than what we really needed in the first place. - root - / \ - tmp usr + One could use a series of umount at each step to prune + out the unneeded mounts. But there is a better solution. + Unclonable mounts come in handy here. 
- How do we set up the same tree at multiple locations under - /root/tmp + step 1: + let's say the root tree has just two directories with + one vfsmount:: - step 2: - :: + root + / \ + tmp usr + How do we set up the same tree at multiple locations under + /root/tmp - mount --bind /root/tmp /root/tmp + step 2: + :: - mount --make-rshared /root - mount --make-unbindable /root/tmp - mkdir -p /tmp/m1 + mount --bind /root/tmp /root/tmp - mount --rbind /root /tmp/m1 + mount --make-rshared /root + mount --make-unbindable /root/tmp - the new tree now looks like this:: + mkdir -p /tmp/m1 - root - / \ - tmp usr - / - m1 - / \ - tmp usr + mount --rbind /root /tmp/m1 - step 3: - :: + the new tree now looks like this:: - mkdir -p /tmp/m2 - mount --rbind /root /tmp/m2 + root + / \ + tmp usr + / + m1 + / \ + tmp usr - the new tree now looks like this:: + step 3: + :: - root - / \ - tmp usr - / \ - m1 m2 - / \ / \ - tmp usr tmp usr + mkdir -p /tmp/m2 + mount --rbind /root /tmp/m2 - step 4: - :: + the new tree now looks like this:: - mkdir -p /tmp/m3 - mount --rbind /root /tmp/m3 + root + / \ + tmp usr + / \ + m1 m2 + / \ / \ + tmp usr tmp usr - the new tree now looks like this:: + step 4: + :: - root - / \ - tmp usr - / \ \ - m1 m2 m3 - / \ / \ / \ - tmp usr tmp usr tmp usr + mkdir -p /tmp/m3 + mount --rbind /root /tmp/m3 + + the new tree now looks like this:: + + root + / \ + tmp usr + / \ \ + m1 m2 m3 + / \ / \ / \ + tmp usr tmp usr tmp usr 8) Implementation ----------------- -8A) Datastructure +A) Datastructure + + Several new fields are introduced to struct vfsmount: + + ->mnt_share + Links together all the mount to/from which this vfsmount + send/receives propagation events. - 4 new fields are introduced to struct vfsmount: + ->mnt_slave_list + Links all the mounts to which this vfsmount propagates + to. - * ->mnt_share - * ->mnt_slave_list - * ->mnt_slave - * ->mnt_master + ->mnt_slave + Links together all the slaves that its master vfsmount + propagates to. - ->mnt_share - links together all the mount to/from which this vfsmount - send/receives propagation events. + ->mnt_master + Points to the master vfsmount from which this vfsmount + receives propagation. - ->mnt_slave_list - links all the mounts to which this vfsmount propagates - to. + ->mnt_flags + Takes two more flags to indicate the propagation status of + the vfsmount. MNT_SHARE indicates that the vfsmount is a shared + vfsmount. MNT_UNCLONABLE indicates that the vfsmount cannot be + replicated. - ->mnt_slave - links together all the slaves that its master vfsmount - propagates to. + All the shared vfsmounts in a peer group form a cyclic list through + ->mnt_share. - ->mnt_master - points to the master vfsmount from which this vfsmount - receives propagation. + All vfsmounts with the same ->mnt_master form on a cyclic list anchored + in ->mnt_master->mnt_slave_list and going through ->mnt_slave. - ->mnt_flags - takes two more flags to indicate the propagation status of - the vfsmount. MNT_SHARE indicates that the vfsmount is a shared - vfsmount. MNT_UNCLONABLE indicates that the vfsmount cannot be - replicated. + ->mnt_master can point to arbitrary (and possibly different) members + of master peer group. To find all immediate slaves of a peer group + you need to go through _all_ ->mnt_slave_list of its members. + Conceptually it's just a single set - distribution among the + individual lists does not affect propagation or the way propagation + tree is modified by operations. 
- All the shared vfsmounts in a peer group form a cyclic list through - ->mnt_share. + All vfsmounts in a peer group have the same ->mnt_master. If it is + non-NULL, they form a contiguous (ordered) segment of slave list. - All vfsmounts with the same ->mnt_master form on a cyclic list anchored - in ->mnt_master->mnt_slave_list and going through ->mnt_slave. + A example propagation tree looks as shown in the figure below. - ->mnt_master can point to arbitrary (and possibly different) members - of master peer group. To find all immediate slaves of a peer group - you need to go through _all_ ->mnt_slave_list of its members. - Conceptually it's just a single set - distribution among the - individual lists does not affect propagation or the way propagation - tree is modified by operations. + .. note:: + Though it looks like a forest, if we consider all the shared + mounts as a conceptual entity called 'pnode', it becomes a tree. - All vfsmounts in a peer group have the same ->mnt_master. If it is - non-NULL, they form a contiguous (ordered) segment of slave list. + :: - A example propagation tree looks as shown in the figure below. - [ NOTE: Though it looks like a forest, if we consider all the shared - mounts as a conceptual entity called 'pnode', it becomes a tree]:: + A <--> B <--> C <---> D + /|\ /| |\ + / F G J K H I + / + E<-->K + /|\ + M L N - A <--> B <--> C <---> D - /|\ /| |\ - / F G J K H I - / - E<-->K - /|\ - M L N + In the above figure A,B,C and D all are shared and propagate to each + other. 'A' has got 3 slave mounts 'E' 'F' and 'G' 'C' has got 2 slave + mounts 'J' and 'K' and 'D' has got two slave mounts 'H' and 'I'. + 'E' is also shared with 'K' and they propagate to each other. And + 'K' has 3 slaves 'M', 'L' and 'N' - In the above figure A,B,C and D all are shared and propagate to each - other. 'A' has got 3 slave mounts 'E' 'F' and 'G' 'C' has got 2 slave - mounts 'J' and 'K' and 'D' has got two slave mounts 'H' and 'I'. - 'E' is also shared with 'K' and they propagate to each other. And - 'K' has 3 slaves 'M', 'L' and 'N' + A's ->mnt_share links with the ->mnt_share of 'B' 'C' and 'D' - A's ->mnt_share links with the ->mnt_share of 'B' 'C' and 'D' + A's ->mnt_slave_list links with ->mnt_slave of 'E', 'K', 'F' and 'G' - A's ->mnt_slave_list links with ->mnt_slave of 'E', 'K', 'F' and 'G' + E's ->mnt_share links with ->mnt_share of K - E's ->mnt_share links with ->mnt_share of K + 'E', 'K', 'F', 'G' have their ->mnt_master point to struct vfsmount of 'A' - 'E', 'K', 'F', 'G' have their ->mnt_master point to struct vfsmount of 'A' + 'M', 'L', 'N' have their ->mnt_master point to struct vfsmount of 'K' - 'M', 'L', 'N' have their ->mnt_master point to struct vfsmount of 'K' + K's ->mnt_slave_list links with ->mnt_slave of 'M', 'L' and 'N' - K's ->mnt_slave_list links with ->mnt_slave of 'M', 'L' and 'N' + C's ->mnt_slave_list links with ->mnt_slave of 'J' and 'K' - C's ->mnt_slave_list links with ->mnt_slave of 'J' and 'K' + J and K's ->mnt_master points to struct vfsmount of C - J and K's ->mnt_master points to struct vfsmount of C + and finally D's ->mnt_slave_list links with ->mnt_slave of 'H' and 'I' - and finally D's ->mnt_slave_list links with ->mnt_slave of 'H' and 'I' + 'H' and 'I' have their ->mnt_master pointing to struct vfsmount of 'D'. - 'H' and 'I' have their ->mnt_master pointing to struct vfsmount of 'D'. + NOTE: The propagation tree is orthogonal to the mount tree. - NOTE: The propagation tree is orthogonal to the mount tree. 
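+ For illustration only, here is how the peer group and the slave list
+ of a vfsmount could be walked with the generic list helpers. This is
+ a simplified sketch (visit_peer() and visit_slave() are placeholders);
+ the in-tree propagation code in fs/pnode.c uses its own small helpers
+ for these walks::
+
+     struct vfsmount *m;
+
+     /* every other member of mnt's peer group */
+     list_for_each_entry(m, &mnt->mnt_share, mnt_share)
+             visit_peer(m);
+
+     /* every slave anchored at this peer-group member */
+     list_for_each_entry(m, &mnt->mnt_slave_list, mnt_slave)
+             visit_slave(m);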
+B) Locking: -8B Locking: + ->mnt_share, ->mnt_slave, ->mnt_slave_list, ->mnt_master are protected + by namespace_sem (exclusive for modifications, shared for reading). - ->mnt_share, ->mnt_slave, ->mnt_slave_list, ->mnt_master are protected - by namespace_sem (exclusive for modifications, shared for reading). + Normally we have ->mnt_flags modifications serialized by vfsmount_lock. + There are two exceptions: do_add_mount() and clone_mnt(). + The former modifies a vfsmount that has not been visible in any shared + data structures yet. + The latter holds namespace_sem and the only references to vfsmount + are in lists that can't be traversed without namespace_sem. - Normally we have ->mnt_flags modifications serialized by vfsmount_lock. - There are two exceptions: do_add_mount() and clone_mnt(). - The former modifies a vfsmount that has not been visible in any shared - data structures yet. - The latter holds namespace_sem and the only references to vfsmount - are in lists that can't be traversed without namespace_sem. +C) Algorithm: -8C Algorithm: + The crux of the implementation resides in rbind/move operation. - The crux of the implementation resides in rbind/move operation. + The overall algorithm breaks the operation into 3 phases: (look at + attach_recursive_mnt() and propagate_mnt()) - The overall algorithm breaks the operation into 3 phases: (look at - attach_recursive_mnt() and propagate_mnt()) + 1. Prepare phase. - 1. prepare phase. - 2. commit phases. - 3. abort phases. + For each mount in the source tree: - Prepare phase: + a) Create the necessary number of mount trees to + be attached to each of the mounts that receive + propagation from the destination mount. + b) Do not attach any of the trees to its destination. + However note down its ->mnt_parent and ->mnt_mountpoint + c) Link all the new mounts to form a propagation tree that + is identical to the propagation tree of the destination + mount. - for each mount in the source tree: + If this phase is successful, there should be 'n' new + propagation trees; where 'n' is the number of mounts in the + source tree. Go to the commit phase - a) Create the necessary number of mount trees to - be attached to each of the mounts that receive - propagation from the destination mount. - b) Do not attach any of the trees to its destination. - However note down its ->mnt_parent and ->mnt_mountpoint - c) Link all the new mounts to form a propagation tree that - is identical to the propagation tree of the destination - mount. + Also there should be 'm' new mount trees, where 'm' is + the number of mounts to which the destination mount + propagates to. - If this phase is successful, there should be 'n' new - propagation trees; where 'n' is the number of mounts in the - source tree. Go to the commit phase + If any memory allocations fail, go to the abort phase. - Also there should be 'm' new mount trees, where 'm' is - the number of mounts to which the destination mount - propagates to. + 2. Commit phase. - if any memory allocations fail, go to the abort phase. + Attach each of the mount trees to their corresponding + destination mounts. - Commit phase - attach each of the mount trees to their corresponding - destination mounts. + 3. Abort phase. - Abort phase - delete all the newly created trees. + Delete all the newly created trees. - .. Note:: - all the propagation related functionality resides in the file pnode.c + .. 
Note:: + all the propagation related functionality resides in the file pnode.c ------------------------------------------------------------------------ diff --git a/Documentation/filesystems/sysfs.rst b/Documentation/filesystems/sysfs.rst index be966f27f284..2703c04af7d0 100644 --- a/Documentation/filesystems/sysfs.rst +++ b/Documentation/filesystems/sysfs.rst @@ -243,8 +243,8 @@ Other notes: - show() methods should return the number of bytes printed into the buffer. -- show() should only use sysfs_emit() or sysfs_emit_at() when formatting - the value to be returned to user space. +- New implementations of show() methods should only use sysfs_emit() or + sysfs_emit_at() when formatting the value to be returned to user space. - store() should return the number of bytes used from the buffer. If the entire buffer has been used, just return the count argument. @@ -299,7 +299,6 @@ The top level sysfs directory looks like:: hypervisor/ kernel/ module/ - net/ power/ devices/ contains a filesystem representation of the device tree. It maps @@ -313,7 +312,7 @@ kernel. Each bus's directory contains two subdirectories:: drivers/ devices/ contains symlinks for each device discovered in the system -that point to the device's directory under root/. +that point to the device's directory under /sys/devices. drivers/ contains a directory for each device driver that is loaded for devices on that particular bus (this assumes that drivers do not @@ -328,15 +327,29 @@ loaded system modules, for both builtin and loadable modules. dev/ contains two directories: char/ and block/. Inside these two directories there are symlinks named <major>:<minor>. These symlinks -point to the sysfs directory for the given device. /sys/dev provides a +point to the directories under /sys/devices for each device. /sys/dev provides a quick way to lookup the sysfs interface for a device from the result of a stat(2) operation. More information on driver-model specific features can be found in Documentation/driver-api/driver-model/. +block/ contains symlinks to all the block devices discovered on the system. +These symlinks point to directories under /sys/devices. -TODO: Finish this section. +class/ contains a directory for each device class, grouped by functional type. +Each directory in class/ contains symlinks to devices in the /sys/devices directory. + +firmware/ contains system firmware data and configuration such as firmware tables, +ACPI information, and device tree data. + +hypervisor/ contains virtualization platform information and provides an interface to +the underlying hypervisor. It is only present when running on a virtual machine. + +kernel/ contains runtime kernel parameters, configuration settings, and status. + +power/ contains power management subsystem information including +sleep states, suspend/resume capabilities, and policies. Current Interfaces diff --git a/Documentation/filesystems/xfs/xfs-online-fsck-design.rst b/Documentation/filesystems/xfs/xfs-online-fsck-design.rst index e231d127cd40..8cbcd3c26434 100644 --- a/Documentation/filesystems/xfs/xfs-online-fsck-design.rst +++ b/Documentation/filesystems/xfs/xfs-online-fsck-design.rst @@ -454,7 +454,7 @@ filesystem so that it can apply pending filesystem updates to the staging information. Once the scan is done, the owning object is re-locked, the live data is used to write a new ondisk structure, and the repairs are committed atomically. -The hooks are disabled and the staging staging area is freed. +The hooks are disabled and the staging area is freed. 
Finally, the storage from the old data structure are carefully reaped.
 
 Introducing concurrency helps online repair avoid various locking problems, but
@@ -475,7 +475,7 @@ operation, which may cause application failure or an unplanned filesystem
 shutdown.
 
 Inspiration for the secondary metadata repair strategy was drawn from section
-2.4 of Srinivasan above, and sections 2 ("NSF: Inded Build Without Side-File")
+2.4 of Srinivasan above, and sections 2 ("NSF: Index Build Without Side-File")
 and 3.1.1 ("Duplicate Key Insert Problem") in C. Mohan, `"Algorithms for
 Creating Indexes for Very Large Tables Without Quiescing Updates"
 <https://dl.acm.org/doi/10.1145/130283.130337>`_, 1992.
@@ -2185,7 +2185,7 @@ The chapter about :ref:`secondary metadata<secondary_metadata>` mentioned that
 checking and repairing of secondary metadata commonly requires coordination
 between a live metadata scan of the filesystem and writer threads that are
 updating that metadata.
-Keeping the scan data up to date requires requires the ability to propagate
+Keeping the scan data up to date requires the ability to propagate
 metadata updates from the filesystem into the data being collected by the
 scan.
 This *can* be done by appending concurrent updates into a separate log file
 and applying them before writing the new metadata to disk, but this leads to
@@ -4179,7 +4179,7 @@ When the exchange is initiated, the sequence of operations is as follows:
    This will be discussed in more detail in subsequent sections.
 
 If the filesystem goes down in the middle of an operation, log recovery will
-find the most recent unfinished maping exchange log intent item and restart
+find the most recent unfinished mapping exchange log intent item and restart
 from there.
 This is how atomic file mapping exchanges guarantees that an
 outside observer will either see the old broken structure or the new one, and
 never a mismash of
diff --git a/Documentation/locking/locktypes.rst b/Documentation/locking/locktypes.rst
index 80c914f6eae7..37b6a5670c2f 100644
--- a/Documentation/locking/locktypes.rst
+++ b/Documentation/locking/locktypes.rst
@@ -204,6 +204,27 @@ per-CPU data structures on a non PREEMPT_RT kernel.
 local_lock is not suitable to protect against preemption or interrupts on a
 PREEMPT_RT kernel due to the PREEMPT_RT specific spinlock_t semantics.
 
+CPU local scope and bottom-half
+-------------------------------
+
+Per-CPU variables that are accessed only in softirq context should not rely on
+the assumption that this context is implicitly protected due to being
+non-preemptible. In a PREEMPT_RT kernel, softirq context is preemptible, and
+synchronizing every bottom-half-disabled section via implicit context results
+in an implicit per-CPU "big kernel lock."
+
+A local_lock_t together with local_lock_nested_bh() and
+local_unlock_nested_bh() for locking operations helps to identify the locking
+scope.
+
+When lockdep is enabled, these functions verify that data structure access
+occurs within softirq context.
+Unlike local_lock(), local_lock_nested_bh() does not disable preemption and
+does not add overhead when used without lockdep.
+
+On a PREEMPT_RT kernel, local_lock_t behaves as a real lock and
+local_lock_nested_bh() serializes access to the data structure, which allows
+removal of serialization via local_bh_disable().
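+
+As a rough sketch of the resulting pattern (the names below are
+illustrative, not taken from an in-tree user)::
+
+  struct pcpu_stats {
+          local_lock_t lock;
+          u64 packets;
+  };
+
+  static DEFINE_PER_CPU(struct pcpu_stats, pcpu_stats) = {
+          .lock = INIT_LOCAL_LOCK(lock),
+  };
+
+  /* Runs in softirq context. */
+  static void pcpu_stats_account(void)
+  {
+          /* Scope the per-CPU access explicitly instead of relying
+           * on the implicit protection of local_bh_disable().
+           */
+          local_lock_nested_bh(&pcpu_stats.lock);
+          this_cpu_inc(pcpu_stats.packets);
+          local_unlock_nested_bh(&pcpu_stats.lock);
+  }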
raw_spinlock_t and spinlock_t ============================= diff --git a/Documentation/locking/seqlock.rst b/Documentation/locking/seqlock.rst index ec6411d02ac8..3fb7ea3ab22a 100644 --- a/Documentation/locking/seqlock.rst +++ b/Documentation/locking/seqlock.rst @@ -1,3 +1,5 @@ +.. SPDX-License-Identifier: GPL-2.0 + ====================================== Sequence counters and sequential locks ====================================== diff --git a/Documentation/maintainer/configure-git.rst b/Documentation/maintainer/configure-git.rst index 0a36831814ea..0c21f203cf7a 100644 --- a/Documentation/maintainer/configure-git.rst +++ b/Documentation/maintainer/configure-git.rst @@ -28,31 +28,3 @@ You may also like to tell ``gpg`` which ``tty`` to use (add to your shell rc file):: export GPG_TTY=$(tty) - - -Creating commit links to lore.kernel.org ----------------------------------------- - -The web site https://lore.kernel.org is meant as a grand archive of all mail -list traffic concerning or influencing the kernel development. Storing archives -of patches here is a recommended practice, and when a maintainer applies a -patch to a subsystem tree, it is a good idea to provide a Link: tag with a -reference back to the lore archive so that people that browse the commit -history can find related discussions and rationale behind a certain change. -The link tag will look like this:: - - Link: https://lore.kernel.org/r/<message-id> - -This can be configured to happen automatically any time you issue ``git am`` -by adding the following hook into your git:: - - $ git config am.messageid true - $ cat >.git/hooks/applypatch-msg <<'EOF' - #!/bin/sh - . git-sh-setup - perl -pi -e 's|^Message-I[dD]:\s*<?([^>]+)>?$|Link: https://lore.kernel.org/r/$1|g;' "$1" - test -x "$GIT_DIR/hooks/commit-msg" && - exec "$GIT_DIR/hooks/commit-msg" ${1+"$@"} - : - EOF - $ chmod a+x .git/hooks/applypatch-msg diff --git a/Documentation/maintainer/maintainer-entry-profile.rst b/Documentation/maintainer/maintainer-entry-profile.rst index cda5d691e967..d36dd892a78a 100644 --- a/Documentation/maintainer/maintainer-entry-profile.rst +++ b/Documentation/maintainer/maintainer-entry-profile.rst @@ -59,6 +59,7 @@ week) that patches might be considered for merging and when patches need to wait for the next -rc. At a minimum: - Last -rc for new feature submissions: + New feature submissions targeting the next merge window should have their first posting for consideration before this point. Patches that are submitted after this point should be clear that they are targeting @@ -68,6 +69,7 @@ wait for the next -rc. At a minimum: submissions should appear before -rc5. - Last -rc to merge features: Deadline for merge decisions + Indicate to contributors the point at which an as yet un-applied patch set will need to wait for the NEXT+1 merge window. Of course there is no obligation to ever accept any given patchset, but if the review has not diff --git a/Documentation/mm/physical_memory.rst b/Documentation/mm/physical_memory.rst index 9af11b5bd145..b76183545e5b 100644 --- a/Documentation/mm/physical_memory.rst +++ b/Documentation/mm/physical_memory.rst @@ -171,6 +171,8 @@ nodes with particular properties as defined by ``enum node_states``: The node has memory(regular, high, movable) ``N_CPU`` The node has one or more CPUs +``N_GENERIC_INITIATOR`` + The node has one or more Generic Initiators For each node that has a property described above, the bit corresponding to the node ID in the ``node_states[<property>]`` bitmask is set. 
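+
+For example, generic kernel code can query these bitmasks through the
+nodemask helpers (a brief illustrative sketch)::
+
+  #include <linux/nodemask.h>
+
+  int nid;
+
+  /* visit every node that currently has memory */
+  for_each_node_state(nid, N_MEMORY)
+          pr_info("node %d has memory\n", nid);
+
+  /* test one property of the local node */
+  if (node_state(numa_node_id(), N_CPU))
+          pr_info("local node has CPUs\n");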
diff --git a/Documentation/networking/can.rst b/Documentation/networking/can.rst index 7650c4b5be5f..f93049f03a37 100644 --- a/Documentation/networking/can.rst +++ b/Documentation/networking/can.rst @@ -539,7 +539,7 @@ CAN Filter Usage Optimisation The CAN filters are processed in per-device filter lists at CAN frame reception time. To reduce the number of checks that need to be performed while walking through the filter lists the CAN core provides an optimized -filter handling when the filter subscription focusses on a single CAN ID. +filter handling when the filter subscription focuses on a single CAN ID. For the possible 2048 SFF CAN identifiers the identifier is used as an index to access the corresponding subscription list without any further checks. diff --git a/Documentation/networking/device_drivers/ethernet/index.rst b/Documentation/networking/device_drivers/ethernet/index.rst index 0b0a3eef6aae..7cfcd183054f 100644 --- a/Documentation/networking/device_drivers/ethernet/index.rst +++ b/Documentation/networking/device_drivers/ethernet/index.rst @@ -50,6 +50,7 @@ Contents: neterion/s2io netronome/nfp pensando/ionic + pensando/ionic_rdma qualcomm/ppe/ppe smsc/smc9 stmicro/stmmac diff --git a/Documentation/networking/device_drivers/ethernet/pensando/ionic.rst b/Documentation/networking/device_drivers/ethernet/pensando/ionic.rst index 05fe2b11bb18..a0029b6db31e 100644 --- a/Documentation/networking/device_drivers/ethernet/pensando/ionic.rst +++ b/Documentation/networking/device_drivers/ethernet/pensando/ionic.rst @@ -13,6 +13,7 @@ Contents - Identifying the Adapter - Enabling the driver - Configuring the driver +- RDMA Support via Auxiliary Device - Statistics - Support @@ -105,6 +106,15 @@ XDP Support for XDP includes the basics, plus Jumbo frames, Redirect and ndo_xmit. There is no current support for zero-copy sockets or HW offload. +RDMA Support via Auxiliary Device +================================= + +The ionic driver supports RDMA (Remote Direct Memory Access) functionality +through the Linux auxiliary device framework when advertised by the firmware. +RDMA capability is detected during device initialization, and if supported, +the ethernet driver will create an auxiliary device that allows the RDMA +driver to bind and provide InfiniBand/RoCE functionality. + Statistics ========== diff --git a/Documentation/networking/device_drivers/ethernet/pensando/ionic_rdma.rst b/Documentation/networking/device_drivers/ethernet/pensando/ionic_rdma.rst new file mode 100644 index 000000000000..42eb461d5f85 --- /dev/null +++ b/Documentation/networking/device_drivers/ethernet/pensando/ionic_rdma.rst @@ -0,0 +1,52 @@ +.. SPDX-License-Identifier: GPL-2.0+ + +=========================================================== +RDMA Driver for the AMD Pensando(R) Ethernet adapter family +=========================================================== + +AMD Pensando RDMA driver. +Copyright (C) 2018-2025, Advanced Micro Devices, Inc. + +Overview +======== + +The ionic_rdma driver provides Remote Direct Memory Access functionality +for AMD Pensando DSC (Distributed Services Card) devices. This driver +implements RDMA capabilities as an auxiliary driver that operates in +conjunction with the ionic ethernet driver. + +The ionic ethernet driver detects RDMA capability during device +initialization and creates auxiliary devices that the ionic_rdma driver +binds to, establishing the RDMA data path and control interfaces. 
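+
+As a sketch of this binding model only (the "ionic.rdma" id string and
+all names below are illustrative, not a guaranteed interface), an
+auxiliary driver binds roughly like this::
+
+    #include <linux/auxiliary_bus.h>
+    #include <linux/module.h>
+
+    static const struct auxiliary_device_id my_rdma_id_table[] = {
+            { .name = "ionic.rdma" },  /* hypothetical modname.devname */
+            {}
+    };
+    MODULE_DEVICE_TABLE(auxiliary, my_rdma_id_table);
+
+    static int my_rdma_probe(struct auxiliary_device *adev,
+                             const struct auxiliary_device_id *id)
+    {
+            /* Look up state shared by the parent ethernet driver and
+             * register the RDMA device here.
+             */
+            return 0;
+    }
+
+    static void my_rdma_remove(struct auxiliary_device *adev)
+    {
+            /* Unregister the RDMA device and release resources. */
+    }
+
+    static struct auxiliary_driver my_rdma_driver = {
+            .probe    = my_rdma_probe,
+            .remove   = my_rdma_remove,
+            .id_table = my_rdma_id_table,
+    };
+    module_auxiliary_driver(my_rdma_driver);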
+ +Identifying the Adapter +======================= + +See Documentation/networking/device_drivers/ethernet/pensando/ionic.rst +for more information on identifying the adapter. + +Enabling the driver +=================== + +The ionic_rdma driver depends on the ionic ethernet driver. +See Documentation/networking/device_drivers/ethernet/pensando/ionic.rst +for detailed information on enabling and configuring the ionic driver. + +The ionic_rdma driver is enabled via the standard kernel configuration system, +using the make command:: + + make oldconfig/menuconfig/etc. + +The driver is located in the menu structure at: + + -> Device Drivers + -> InfiniBand support + -> AMD Pensando DSC RDMA/RoCE Support + +Support +======= + +For general Linux RDMA support, please use the RDMA mailing +list, which is monitored by AMD Pensando personnel:: + + linux-rdma@vger.kernel.org diff --git a/Documentation/networking/device_drivers/ethernet/ti/am65_nuss_cpsw_switchdev.rst b/Documentation/networking/device_drivers/ethernet/ti/am65_nuss_cpsw_switchdev.rst index 25fd9aa284e2..f0424597aac1 100644 --- a/Documentation/networking/device_drivers/ethernet/ti/am65_nuss_cpsw_switchdev.rst +++ b/Documentation/networking/device_drivers/ethernet/ti/am65_nuss_cpsw_switchdev.rst @@ -42,7 +42,7 @@ Port's netdev devices have to be in UP before joining to the bridge to avoid overwriting of bridge configuration as CPSW switch driver completely reloads its configuration when first port changes its state to UP. -When the both interfaces joined the bridge - CPSW switch driver will enable +When both interfaces have joined the bridge - CPSW switch driver will enable marking packets with offload_fwd_mark flag. All configuration is implemented via switchdev API. diff --git a/Documentation/networking/device_drivers/ethernet/ti/cpsw_switchdev.rst b/Documentation/networking/device_drivers/ethernet/ti/cpsw_switchdev.rst index 464dce938ed1..2f3c43a32bfc 100644 --- a/Documentation/networking/device_drivers/ethernet/ti/cpsw_switchdev.rst +++ b/Documentation/networking/device_drivers/ethernet/ti/cpsw_switchdev.rst @@ -92,7 +92,7 @@ Port's netdev devices have to be in UP before joining to the bridge to avoid overwriting of bridge configuration as CPSW switch driver copletly reloads its configuration when first Port changes its state to UP. -When the both interfaces joined the bridge - CPSW switch driver will enable +When both interfaces have joined the bridge - CPSW switch driver will enable marking packets with offload_fwd_mark flag unless "ale_bypass=0" All configuration is implemented via switchdev API. diff --git a/Documentation/networking/rds.rst b/Documentation/networking/rds.rst index 41b0a6182fe4..4261146e9d92 100644 --- a/Documentation/networking/rds.rst +++ b/Documentation/networking/rds.rst @@ -339,7 +339,7 @@ The send path rds_sendmsg() - struct rds_message built from incoming data - CMSGs parsed (e.g. RDMA ops) - - transport connection alloced and connected if not already + - transport connection allocated and connected if not already - rds_message placed on send queue - send worker awoken diff --git a/Documentation/power/pci.rst b/Documentation/power/pci.rst index 9ebecb7b00b2..38e614d92a4a 100644 --- a/Documentation/power/pci.rst +++ b/Documentation/power/pci.rst @@ -472,7 +472,7 @@ in the device tree from the root bridge to a leaf device contains both of them). 
The pci_pm_suspend_noirq() routine is executed after suspend_device_irqs() has been called, which means that the device driver's interrupt handler won't be invoked while this routine is running. It first checks if the device's driver -implements legacy PCI suspends routines (Section 3), in which case the legacy +implements legacy PCI suspend routines (Section 3), in which case the legacy late suspend routine is called and its result is returned (the standard configuration registers of the device are saved if the driver's callback hasn't done that). Second, if the device driver's struct dev_pm_ops object is not @@ -544,7 +544,7 @@ result is then returned). The resume phase is carried out asynchronously for PCI devices, like the suspend phase described above, which means that if two PCI devices don't depend on each other in a known way, the pci_pm_resume() routine may be executed for -the both of them in parallel. +both of them in parallel. The pci_pm_complete() routine only executes the device driver's pm->complete() callback, if defined. diff --git a/Documentation/power/suspend-and-cpuhotplug.rst b/Documentation/power/suspend-and-cpuhotplug.rst index ebedb6c75db9..641d09a6546b 100644 --- a/Documentation/power/suspend-and-cpuhotplug.rst +++ b/Documentation/power/suspend-and-cpuhotplug.rst @@ -13,7 +13,7 @@ infrastructure uses it internally? And where do they share common code? Well, a picture is worth a thousand words... So ASCII art follows :-) -[This depicts the current design in the kernel, and focusses only on the +[This depicts the current design in the kernel, and focuses only on the interactions involving the freezer and CPU hotplug and also tries to explain the locking involved. It outlines the notifications involved as well. But please note that here, only the call paths are illustrated, with the aim diff --git a/Documentation/process/5.Posting.rst b/Documentation/process/5.Posting.rst index 22fa925353cf..9999bcbdccc9 100644 --- a/Documentation/process/5.Posting.rst +++ b/Documentation/process/5.Posting.rst @@ -207,10 +207,9 @@ document with a specification implemented by the patch:: Link: https://example.com/somewhere.html optional-other-stuff -Many maintainers when applying a patch also add this tag to link to the -latest public review posting of the patch; often this is automatically done -by tools like b4 or a git hook like the one described in -'Documentation/maintainer/configure-git.rst'. +As per guidance from the Chief Penguin, a Link: tag should only be added to +a commit if it leads to useful information that is not found in the commit +itself. If the URL points to a public bug report being fixed by the patch, use the "Closes:" tag instead:: diff --git a/Documentation/process/changes.rst b/Documentation/process/changes.rst index b38622b0d525..62951cdb13ad 100644 --- a/Documentation/process/changes.rst +++ b/Documentation/process/changes.rst @@ -61,7 +61,7 @@ Sphinx\ [#f1]_ 3.4.3 sphinx-build --version GNU tar 1.28 tar --version gtags (optional) 6.6.5 gtags --version mkimage (optional) 2017.01 mkimage --version -Python (optional) 3.9.x python3 --version +Python 3.9.x python3 --version GNU AWK (optional) 5.1.0 gawk --version ====================== =============== ======================================== @@ -154,6 +154,13 @@ Perl You will need perl 5 and the following modules: ``Getopt::Long``, ``Getopt::Std``, ``File::Basename``, and ``File::Find`` to build the kernel. 
+Python +------ + +Several config options require it: it is required for arm/arm64 +default configs, CONFIG_LTO_CLANG, some DRM optional configs, +the kernel-doc tool, and docs build (Sphinx), among others. + BC -- diff --git a/Documentation/process/maintainer-pgp-guide.rst b/Documentation/process/maintainer-pgp-guide.rst index f5277993b195..b6919bf606c3 100644 --- a/Documentation/process/maintainer-pgp-guide.rst +++ b/Documentation/process/maintainer-pgp-guide.rst @@ -49,7 +49,7 @@ hosting infrastructure, regardless of how good the security practices for the latter may be. The above guiding principle is the reason why this guide is needed. We -want to make sure that by placing trust into developers we do not simply +want to make sure that by placing trust into developers we do not merely shift the blame for potential future security incidents to someone else. The goal is to provide a set of guidelines developers can use to create a secure working environment and safeguard the PGP keys used to @@ -60,7 +60,7 @@ establish the integrity of the Linux kernel itself. PGP tools ========= -Use GnuPG 2.2 or later +Use GnuPG 2.4 or later ---------------------- Your distro should already have GnuPG installed by default, you just @@ -69,9 +69,9 @@ To check, run:: $ gpg --version | head -n1 -If you have version 2.2 or above, then you are good to go. If you have a -version that is prior than 2.2, then some commands from this guide may -not work. +If you have version 2.4 or above, then you are good to go. If you have +an earlier version, then you are using a release of GnuPG that is no +longer maintained and some commands from this guide may not work. Configure gpg-agent options ~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -199,13 +199,6 @@ separate signing subkey:: $ gpg --quick-addkey [fpr] ed25519 sign -.. note:: ECC support in GnuPG - - Note, that if you intend to use a hardware token that does not - support ED25519 ECC keys, you should choose "nistp256" instead or - "ed25519." See the section below on recommended hardware devices. - - Back up your Certify key for disaster recovery ---------------------------------------------- @@ -213,7 +206,7 @@ The more signatures you have on your PGP key from other developers, the more reasons you have to create a backup version that lives on something other than digital media, for disaster recovery reasons. -The best way to create a printable hardcopy of your private key is by +A good way to create a printable hardcopy of your private key is by using the ``paperkey`` software written for this very purpose. See ``man paperkey`` for more details on the output format and its benefits over other solutions. Paperkey should already be packaged for most @@ -224,11 +217,11 @@ key:: $ gpg --export-secret-key [fpr] | paperkey -o /tmp/key-backup.txt -Print out that file (or pipe the output straight to lpr), then take a -pen and write your passphrase on the margin of the paper. **This is -strongly recommended** because the key printout is still encrypted with -that passphrase, and if you ever change it you will not remember what it -used to be when you had created the backup -- *guaranteed*. +Print out that file, then take a pen and write your passphrase on the +margin of the paper. **This is strongly recommended** because the key +printout is still encrypted with that passphrase, and if you ever change +it you will not remember what it used to be when you had created the +backup -- *guaranteed*. 
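
Should the printout ever be needed, paperkey can reconstruct the secret key
from it together with a copy of your public key (a sketch; the file names are
illustrative, and ``[fpr]`` is your key fingerprint as elsewhere in this
guide)::

    $ gpg --export [fpr] > pubkey.gpg
    $ paperkey --pubring pubkey.gpg --secrets key-backup.txt --output secret-key.gpg
    $ gpg --import secret-key.gpg
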
Put the resulting printout and the hand-written passphrase into an envelope and store in a secure and well-protected place, preferably away from your @@ -236,10 +229,9 @@ home, such as your bank vault. .. note:: - Your printer is probably no longer a simple dumb device connected to - your parallel port, but since the output is still encrypted with - your passphrase, printing out even to "cloud-integrated" modern - printers should remain a relatively safe operation. + The key is still encrypted with your passphrase, so printing out + even to "cloud-integrated" modern printers should remain a + relatively safe operation. Back up your whole GnuPG directory ---------------------------------- @@ -255,16 +247,17 @@ on these external copies whenever you need to use your Certify key -- such as when making changes to your own key or signing other people's keys after conferences and summits. -Start by getting a small USB "thumb" drive (preferably two!) that you -will use for backup purposes. You will need to encrypt them using LUKS --- refer to your distro's documentation on how to accomplish this. +Start by getting an external media card (preferably two!) that you will +use for backup purposes. You will need to create an encrypted partition +on this device using LUKS -- refer to your distro's documentation on how +to accomplish this. For the encryption passphrase, you can use the same one as on your PGP key. -Once the encryption process is over, re-insert the USB drive and make -sure it gets properly mounted. Copy your entire ``.gnupg`` directory -over to the encrypted storage:: +Once the encryption process is over, re-insert your device and make sure +it gets properly mounted. Copy your entire ``.gnupg`` directory over to +the encrypted storage:: $ cp -a ~/.gnupg /media/disk/foo/gnupg-backup @@ -273,11 +266,10 @@ You should now test to make sure everything still works:: $ gpg --homedir=/media/disk/foo/gnupg-backup --list-key [fpr] If you don't get any errors, then you should be good to go. Unmount the -USB drive, distinctly label it so you don't blow it away next time you -need to use a random USB drive, and put in a safe place -- but not too -far away, because you'll need to use it every now and again for things -like editing identities, adding or revoking subkeys, or signing other -people's keys. +device, distinctly label it so you don't overwrite it by accident, and +put in a safe place -- but not too far away, because you'll need to use +it every now and again for things like editing identities, adding or +revoking subkeys, or signing other people's keys. Remove the Certify key from your homedir ---------------------------------------- @@ -303,7 +295,7 @@ and store it on offline storage. your GnuPG directory in its entirety. What we are about to do will render your key useless if you do not have a usable backup! -First, identify the keygrip of your Certify key:: +First, identify the "keygrip" of your Certify key:: $ gpg --with-keygrip --list-key [fpr] @@ -328,8 +320,8 @@ Certify key fingerprint). 
This will correspond directly to a file in your 2222000000000000000000000000000000000000.key 3333000000000000000000000000000000000000.key -All you have to do is simply remove the .key file that corresponds to -the Certify key keygrip:: +It is sufficient to remove the .key file that corresponds to the Certify +key keygrip:: $ cd ~/.gnupg/private-keys-v1.d $ rm 1111000000000000000000000000000000000000.key @@ -372,7 +364,7 @@ GnuPG operation is performed, the keys are loaded into system memory and can be stolen from there by sufficiently advanced malware (think Meltdown and Spectre). -The best way to completely protect your keys is to move them to a +A good way to completely protect your keys is to move them to a specialized hardware device that is capable of smartcard operations. The benefits of smartcards @@ -383,11 +375,11 @@ private keys and performing crypto operations directly on the card itself. Because the key contents never leave the smartcard, the operating system of the computer into which you plug in the hardware device is not able to retrieve the private keys themselves. This is very -different from the encrypted USB storage device we used earlier for -backup purposes -- while that USB device is plugged in and mounted, the +different from the encrypted media storage device we used earlier for +backup purposes -- while that device is plugged in and mounted, the operating system is able to access the private key contents. -Using external encrypted USB media is not a substitute to having a +Using external encrypted media is not a substitute to having a smartcard-capable device. Available smartcard devices @@ -398,17 +390,15 @@ easiest is to get a specialized USB device that implements smartcard functionality. There are several options available: - `Nitrokey Start`_: Open hardware and Free Software, based on FSI - Japan's `Gnuk`_. One of the few available commercial devices that - support ED25519 ECC keys, but offer fewest security features (such as - resistance to tampering or some side-channel attacks). -- `Nitrokey Pro 2`_: Similar to the Nitrokey Start, but more - tamper-resistant and offers more security features. Pro 2 supports ECC - cryptography (NISTP). + Japan's `Gnuk`_. One of the cheapest options, but offers fewest + security features (such as resistance to tampering or some + side-channel attacks). +- `Nitrokey 3`_: Similar to the Nitrokey Start, but more + tamper-resistant and offers more security features and USB + form-factors. Supports ECC cryptography (ED25519 and NISTP). - `Yubikey 5`_: proprietary hardware and software, but cheaper than - Nitrokey Pro and comes available in the USB-C form that is more useful - with newer laptops. Offers additional security features such as FIDO - U2F, among others, and now finally supports NISTP and ED25519 ECC - keys. + Nitrokey with a similar set of features. Supports ECC cryptography + (ED25519 and NISTP). Your choice will depend on cost, shipping availability in your geographical region, and open/proprietary hardware considerations. @@ -419,8 +409,8 @@ geographical region, and open/proprietary hardware considerations. you `qualify for a free Nitrokey Start`_ courtesy of The Linux Foundation. -.. _`Nitrokey Start`: https://shop.nitrokey.com/shop/product/nitrokey-start-6 -.. _`Nitrokey Pro 2`: https://shop.nitrokey.com/shop/product/nkpr2-nitrokey-pro-2-3 +.. _`Nitrokey Start`: https://www.nitrokey.com/products/nitrokeys +.. _`Nitrokey 3`: https://www.nitrokey.com/products/nitrokeys .. 
_`Yubikey 5`: https://www.yubico.com/products/yubikey-5-overview/ .. _Gnuk: https://www.fsij.org/doc-gnuk/ .. _`qualify for a free Nitrokey Start`: https://www.kernel.org/nitrokey-digital-tokens-for-kernel-developers.html @@ -455,7 +445,7 @@ the smartcard). You so rarely need to use the Admin PIN, that you will inevitably forget what it is if you do not record it. Getting back to the main card menu, you can also set other values (such -as name, sex, login data, etc), but it's not necessary and will +as name, gender, login data, etc), but it's not necessary and will additionally leak information about your smartcard should you lose it. .. note:: @@ -615,7 +605,7 @@ run:: You can also use a specific date if that is easier to remember (e.g. your birthday, January 1st, or Canada Day):: - $ gpg --quick-set-expire [fpr] 2025-07-01 + $ gpg --quick-set-expire [fpr] 2038-07-01 Remember to send the updated key back to keyservers:: @@ -656,9 +646,9 @@ hundreds of cloned repositories floating around, how does anyone verify that their copy of linux.git has not been tampered with by a malicious third party? -Or what happens if a backdoor is discovered in the code and the "Author" -line in the commit says it was done by you, while you're pretty sure you -had `nothing to do with it`_? +Or what happens if malicious code is discovered in the kernel and the +"Author" line in the commit says it was done by you, while you're pretty +sure you had `nothing to do with it`_? To address both of these issues, Git introduced PGP integration. Signed tags prove the repository integrity by assuring that its contents are @@ -681,8 +671,7 @@ should be used (``[fpr]`` is the fingerprint of your key):: How to work with signed tags ---------------------------- -To create a signed tag, simply pass the ``-s`` switch to the tag -command:: +To create a signed tag, pass the ``-s`` switch to the tag command:: $ git tag -s [tagname] @@ -693,7 +682,7 @@ not been maliciously altered. How to verify signed tags ~~~~~~~~~~~~~~~~~~~~~~~~~ -To verify a signed tag, simply use the ``verify-tag`` command:: +To verify a signed tag, use the ``verify-tag`` command:: $ git verify-tag [tagname] @@ -712,9 +701,9 @@ The merge message will contain something like this:: # gpg: Signature made [...] # gpg: Good signature from [...] -If you are verifying someone else's git tag, then you will need to -import their PGP key. Please refer to the -":ref:`verify_identities`" section below. +If you are verifying someone else's git tag, you will first need to +import their PGP key. Please refer to the ":ref:`verify_identities`" +section below. Configure git to always sign annotated tags ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -728,16 +717,16 @@ configuration option:: How to work with signed commits ------------------------------- -It is easy to create signed commits, but it is much more difficult to -use them in Linux kernel development, since it relies on patches sent to -the mailing list, and this workflow does not preserve PGP commit -signatures. Furthermore, when rebasing your repository to match -upstream, even your own PGP commit signatures will end up discarded. For -this reason, most kernel developers don't bother signing their commits -and will ignore signed commits in any external repositories that they -rely upon in their work. +It is also possible to create signed commits, but they have limited +usefulness in Linux kernel development. 
The kernel contribution workflow +relies on sending in patches, and converting commits to patches does not +preserve git commit signatures. Furthermore, when rebasing your own +repository on a newer upstream, PGP commit signatures will end up +discarded. For this reason, most kernel developers don't bother signing +their commits and will ignore signed commits in any external +repositories that they rely upon in their work. -However, if you have your working git tree publicly available at some +That said, if you have your working git tree publicly available at some git hosting service (kernel.org, infradead.org, ozlabs.org, or others), then the recommendation is that you sign all your git commits even if upstream developers do not directly benefit from this practice. @@ -748,7 +737,7 @@ We recommend this for the following reasons: provenance, even externally maintained trees carrying PGP commit signatures will be valuable for such purposes. 2. If you ever need to re-clone your local repository (for example, - after a disk failure), this lets you easily verify the repository + after reinstalling your system), this lets you verify the repository integrity before resuming your work. 3. If someone needs to cherry-pick your commits, this allows them to quickly verify their integrity before applying them. @@ -756,9 +745,8 @@ We recommend this for the following reasons: Creating signed commits ~~~~~~~~~~~~~~~~~~~~~~~ -To create a signed commit, you just need to pass the ``-S`` flag to the -``git commit`` command (it's capital ``-S`` due to collision with -another flag):: +To create a signed commit, pass the ``-S`` flag to the ``git commit`` +command (it's capital ``-S`` due to collision with another flag):: $ git commit -S @@ -775,7 +763,6 @@ You can tell git to always sign commits:: .. _verify_identities: - How to work with signed patches ------------------------------- @@ -793,6 +780,11 @@ headers (a-la DKIM): Installing and configuring patatt ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. note:: + + If you use B4 to send in your patches, patatt is already installed + and integrated into your workflow. + Patatt is packaged for many distributions already, so please check there first. You can also install it from pypi using "``pip install patatt``". @@ -835,9 +827,9 @@ encounters, for example:: How to verify kernel developer identities ========================================= -Signing tags and commits is easy, but how does one go about verifying -that the key used to sign something belongs to the actual kernel -developer and not to a malicious imposter? +Signing tags and commits is straightforward, but how does one go about +verifying that the key used to sign something belongs to the actual +kernel developer and not to a malicious imposter? Configure auto-key-retrieval using WKD and DANE ----------------------------------------------- @@ -884,7 +876,7 @@ various software makers dictating who should be your trusted certifying entity, PGP leaves this responsibility to each user. Unfortunately, very few people understand how the Web of Trust works. -While it remains an important aspect of the OpenPGP specification, +While it is still an important part of the OpenPGP specification, recent versions of GnuPG (2.2 and above) have implemented an alternative mechanism called "Trust on First Use" (TOFU). You can think of TOFU as "the SSH-like approach to trust." With SSH, the first time you connect @@ -894,8 +886,8 @@ to connect, forcing you to make a decision on whether you choose to trust the changed key or not. 
Similarly, the first time you import someone's PGP key, it is assumed to be valid. If at any point in the future GnuPG comes across another key with the same identity, both the -previously imported key and the new key will be marked as invalid and -you will need to manually figure out which one to keep. +previously imported key and the new key will be marked for verification +and you will need to manually figure out which one to keep. We recommend that you use the combined TOFU+PGP trust model (which is the new default in GnuPG v2). To set it, add (or modify) the diff --git a/Documentation/process/submitting-patches.rst b/Documentation/process/submitting-patches.rst index cede4e7b29af..910e8fc9e3c8 100644 --- a/Documentation/process/submitting-patches.rst +++ b/Documentation/process/submitting-patches.rst @@ -343,7 +343,7 @@ https://en.wikipedia.org/wiki/Posting_style#Interleaved_style As is frequently quoted on the mailing list:: A: http://en.wikipedia.org/wiki/Top_post - Q: Were do I find info about this thing called top-posting? + Q: Where do I find info about this thing called top-posting? A: Because it messes up the order in which people normally read text. Q: Why is top-posting such a bad thing? A: Top-posting. @@ -602,8 +602,8 @@ future. Note, this is one of only three tags you might be able to use without explicit permission of the person named (see 'Tagging people requires permission' below for details). -A Fixes: tag indicates that the patch fixes an issue in a previous commit. It -is used to make it easy to determine where a bug originated, which can help +A Fixes: tag indicates that the patch fixes a bug in a previous commit. It +is used to make it easy to determine where an issue originated, which can help review a bug fix. This tag also assists the stable kernel team in determining which stable kernel versions should receive your fix. This is the preferred method for indicating a bug fixed by the patch. See :ref:`describe_changes` diff --git a/Documentation/sphinx/automarkup.py b/Documentation/sphinx/automarkup.py index 563033f764bb..1d9dada40a74 100644 --- a/Documentation/sphinx/automarkup.py +++ b/Documentation/sphinx/automarkup.py @@ -244,7 +244,7 @@ def add_and_resolve_xref(app, docname, domain, reftype, target, contnode=None): return contnode # -# Variant of markup_abi_ref() that warns whan a reference is not found +# Variant of markup_abi_ref() that warns when a reference is not found # def markup_abi_file_ref(docname, app, match): return markup_abi_ref(docname, app, match, warning=True) diff --git a/Documentation/sphinx/cdomain.py b/Documentation/sphinx/cdomain.py deleted file mode 100644 index 3dc285dc70f5..000000000000 --- a/Documentation/sphinx/cdomain.py +++ /dev/null @@ -1,247 +0,0 @@ -# -*- coding: utf-8; mode: python -*- -# SPDX-License-Identifier: GPL-2.0 -# pylint: disable=W0141,C0113,C0103,C0325 -""" - cdomain - ~~~~~~~ - - Replacement for the sphinx c-domain. - - :copyright: Copyright (C) 2016 Markus Heiser - :license: GPL Version 2, June 1991 see Linux/COPYING for details. - - List of customizations: - - * Moved the *duplicate C object description* warnings for function - declarations in the nitpicky mode. See Sphinx documentation for - the config values for ``nitpick`` and ``nitpick_ignore``. - - * Add option 'name' to the "c:function:" directive. With option 'name' the - ref-name of a function can be modified. E.g.:: - - .. c:function:: int ioctl( int fd, int request ) - :name: VIDIOC_LOG_STATUS - - The func-name (e.g. 
ioctl) remains in the output but the ref-name changed - from 'ioctl' to 'VIDIOC_LOG_STATUS'. The function is referenced by:: - - * :c:func:`VIDIOC_LOG_STATUS` or - * :any:`VIDIOC_LOG_STATUS` (``:any:`` needs sphinx 1.3) - - * Handle signatures of function-like macros well. Don't try to deduce - arguments types of function-like macros. - -""" - -from docutils import nodes -from docutils.parsers.rst import directives - -import sphinx -from sphinx import addnodes -from sphinx.domains.c import c_funcptr_sig_re, c_sig_re -from sphinx.domains.c import CObject as Base_CObject -from sphinx.domains.c import CDomain as Base_CDomain -from itertools import chain -import re - -__version__ = '1.1' - -# Namespace to be prepended to the full name -namespace = None - -# -# Handle trivial newer c domain tags that are part of Sphinx 3.1 c domain tags -# - Store the namespace if ".. c:namespace::" tag is found -# -RE_namespace = re.compile(r'^\s*..\s*c:namespace::\s*(\S+)\s*$') - -def markup_namespace(match): - global namespace - - namespace = match.group(1) - - return "" - -# -# Handle c:macro for function-style declaration -# -RE_macro = re.compile(r'^\s*..\s*c:macro::\s*(\S+)\s+(\S.*)\s*$') -def markup_macro(match): - return ".. c:function:: " + match.group(1) + ' ' + match.group(2) - -# -# Handle newer c domain tags that are evaluated as .. c:type: for -# backward-compatibility with Sphinx < 3.0 -# -RE_ctype = re.compile(r'^\s*..\s*c:(struct|union|enum|enumerator|alias)::\s*(.*)$') - -def markup_ctype(match): - return ".. c:type:: " + match.group(2) - -# -# Handle newer c domain tags that are evaluated as :c:type: for -# backward-compatibility with Sphinx < 3.0 -# -RE_ctype_refs = re.compile(r':c:(var|struct|union|enum|enumerator)::`([^\`]+)`') -def markup_ctype_refs(match): - return ":c:type:`" + match.group(2) + '`' - -# -# Simply convert :c:expr: and :c:texpr: into a literal block. -# -RE_expr = re.compile(r':c:(expr|texpr):`([^\`]+)`') -def markup_c_expr(match): - return '\\ ``' + match.group(2) + '``\\ ' - -# -# Parse Sphinx 3.x C markups, replacing them by backward-compatible ones -# -def c_markups(app, docname, source): - result = "" - markup_func = { - RE_namespace: markup_namespace, - RE_expr: markup_c_expr, - RE_macro: markup_macro, - RE_ctype: markup_ctype, - RE_ctype_refs: markup_ctype_refs, - } - - lines = iter(source[0].splitlines(True)) - for n in lines: - match_iterators = [regex.finditer(n) for regex in markup_func] - matches = sorted(chain(*match_iterators), key=lambda m: m.start()) - for m in matches: - n = n[:m.start()] + markup_func[m.re](m) + n[m.end():] - - result = result + n - - source[0] = result - -# -# Now implements support for the cdomain namespacing logic -# - -def setup(app): - - # Handle easy Sphinx 3.1+ simple new tags: :c:expr and .. c:namespace:: - app.connect('source-read', c_markups) - app.add_domain(CDomain, override=True) - - return dict( - version = __version__, - parallel_read_safe = True, - parallel_write_safe = True - ) - -class CObject(Base_CObject): - - """ - Description of a C language object. - """ - option_spec = { - "name" : directives.unchanged - } - - def handle_func_like_macro(self, sig, signode): - """Handles signatures of function-like macros. - - If the objtype is 'function' and the signature ``sig`` is a - function-like macro, the name of the macro is returned. Otherwise - ``False`` is returned. 
""" - - global namespace - - if not self.objtype == 'function': - return False - - m = c_funcptr_sig_re.match(sig) - if m is None: - m = c_sig_re.match(sig) - if m is None: - raise ValueError('no match') - - rettype, fullname, arglist, _const = m.groups() - arglist = arglist.strip() - if rettype or not arglist: - return False - - arglist = arglist.replace('`', '').replace('\\ ', '') # remove markup - arglist = [a.strip() for a in arglist.split(",")] - - # has the first argument a type? - if len(arglist[0].split(" ")) > 1: - return False - - # This is a function-like macro, its arguments are typeless! - signode += addnodes.desc_name(fullname, fullname) - paramlist = addnodes.desc_parameterlist() - signode += paramlist - - for argname in arglist: - param = addnodes.desc_parameter('', '', noemph=True) - # separate by non-breaking space in the output - param += nodes.emphasis(argname, argname) - paramlist += param - - if namespace: - fullname = namespace + "." + fullname - - return fullname - - def handle_signature(self, sig, signode): - """Transform a C signature into RST nodes.""" - - global namespace - - fullname = self.handle_func_like_macro(sig, signode) - if not fullname: - fullname = super(CObject, self).handle_signature(sig, signode) - - if "name" in self.options: - if self.objtype == 'function': - fullname = self.options["name"] - else: - # FIXME: handle :name: value of other declaration types? - pass - else: - if namespace: - fullname = namespace + "." + fullname - - return fullname - - def add_target_and_index(self, name, sig, signode): - # for C API items we add a prefix since names are usually not qualified - # by a module name and so easily clash with e.g. section titles - targetname = 'c.' + name - if targetname not in self.state.document.ids: - signode['names'].append(targetname) - signode['ids'].append(targetname) - signode['first'] = (not self.names) - self.state.document.note_explicit_target(signode) - inv = self.env.domaindata['c']['objects'] - if (name in inv and self.env.config.nitpicky): - if self.objtype == 'function': - if ('c:func', name) not in self.env.config.nitpick_ignore: - self.state_machine.reporter.warning( - 'duplicate C object description of %s, ' % name + - 'other instance in ' + self.env.doc2path(inv[name][0]), - line=self.lineno) - inv[name] = (self.env.docname, self.objtype) - - indextext = self.get_index_text(name) - if indextext: - self.indexnode['entries'].append( - ('single', indextext, targetname, '', None)) - -class CDomain(Base_CDomain): - - """C language domain.""" - name = 'c' - label = 'C' - directives = { - 'function': CObject, - 'member': CObject, - 'macro': CObject, - 'type': CObject, - 'var': CObject, - } diff --git a/Documentation/sphinx/kernel_feat.py b/Documentation/sphinx/kernel_feat.py index e3a51867f27b..aaac76892ceb 100644 --- a/Documentation/sphinx/kernel_feat.py +++ b/Documentation/sphinx/kernel_feat.py @@ -40,9 +40,11 @@ import sys from docutils import nodes, statemachine from docutils.statemachine import ViewList from docutils.parsers.rst import directives, Directive -from docutils.utils.error_reporting import ErrorString from sphinx.util.docutils import switch_source_input +def ErrorString(exc): # Shamelessly stolen from docutils + return f'{exc.__class__.__name}: {exc}' + __version__ = '1.0' def setup(app): diff --git a/Documentation/sphinx/kernel_include.py b/Documentation/sphinx/kernel_include.py index 1e566e87ebcd..f94412cd17c9 100755 --- a/Documentation/sphinx/kernel_include.py +++ b/Documentation/sphinx/kernel_include.py 
@@ -1,31 +1,82 @@
 #!/usr/bin/env python3
-# -*- coding: utf-8; mode: python -*-
 # SPDX-License-Identifier: GPL-2.0
-# pylint: disable=R0903, C0330, R0914, R0912, E0401
+# pylint: disable=R0903, R0912, R0914, R0915, C0209,W0707
+
 """
-    kernel-include
-    ~~~~~~~~~~~~~~
+Implementation of the ``kernel-include`` reST-directive.
+
+:copyright:  Copyright (C) 2016  Markus Heiser
+:license:    GPL Version 2, June 1991 see linux/COPYING for details.
+
+The ``kernel-include`` reST-directive is a replacement for the ``include``
+directive. The ``kernel-include`` directive expands environment variables in
+the path name and allows including files from arbitrary locations.
+
+.. hint::
+
+   Including files from arbitrary locations (e.g. from ``/etc``) is a
+   security risk for builders. This is why the ``include`` directive from
+   docutils *prohibits* pathnames pointing to locations *above* the filesystem
+   tree where the reST document with the include directive is placed.
+
+Substrings of the form $name or ${name} are replaced by the value of
+environment variable name. Malformed variable names and references to
+non-existing variables are left unchanged.
+
+**Supported Sphinx Include Options**:
+
+:param literal:
+   If present, the included file is inserted as a literal block.
+
+:param code:
+   Specify the language for syntax highlighting (e.g., 'c', 'python').
+
+:param encoding:
+   Specify the encoding of the included file (default: 'utf-8').
+
+:param tab-width:
+   Specify the number of spaces that a tab represents.
+
+:param start-line:
+   Line number at which to start including the file (1-based).
+
+:param end-line:
+   Line number at which to stop including the file (inclusive).
+
+:param start-after:
+   Include lines after the first line matching this text.
+
+:param end-before:
+   Include lines before the first line matching this text.
 
-    Implementation of the ``kernel-include`` reST-directive.
+:param number-lines:
+   Number the included lines (integer specifies start number).
+   Only effective with 'literal' or 'code' options.
 
-    :copyright:  Copyright (C) 2016  Markus Heiser
-    :license:    GPL Version 2, June 1991 see linux/COPYING for details.
+:param class:
+   Specify HTML class attribute for the included content.
 
-    The ``kernel-include`` reST-directive is a replacement for the ``include``
-    directive. The ``kernel-include`` directive expand environment variables in
-    the path name and allows to include files from arbitrary locations.
+**Kernel-specific Extensions**:
 
-    .. hint::
+:param generate-cross-refs:
+   If present, instead of directly including the file, it calls
+   ParseDataStructs() to convert C data structures into cross-references
+   that link to comprehensive documentation in other ReST files.
 
-      Including files from arbitrary locations (e.g. from ``/etc``) is a
-      security risk for builders. This is why the ``include`` directive from
-      docutils *prohibit* pathnames pointing to locations *above* the filesystem
-      tree where the reST document with the include directive is placed.
+:param exception-file:
+   (Used with generate-cross-refs)
 
-    Substrings of the form $name or ${name} are replaced by the value of
-    environment variable name. Malformed variable names and references to
-    non-existing variables are left unchanged.
+   Path to a file containing rules for handling special cases:
+   - Ignore specific C data structures
+   - Use alternative reference names
+   - Specify different reference types
+
+:param warn-broken:
+   (Used with generate-cross-refs)
+
+   Enables warnings when auto-generated cross-references don't point to
+   existing documentation targets.
 """
 
 # ==============================================================================
@@ -33,161 +84,366 @@
 # ==============================================================================
 
 import os.path
+import re
+import sys
 
 from docutils import io, nodes, statemachine
-from docutils.utils.error_reporting import SafeString, ErrorString
-from docutils.parsers.rst import directives
+from docutils.statemachine import ViewList
+from docutils.parsers.rst import Directive, directives
 from docutils.parsers.rst.directives.body import CodeBlock, NumberLines
-from docutils.parsers.rst.directives.misc import Include
 
-__version__ = '1.0'
+from sphinx.util import logging
 
-# ==============================================================================
-def setup(app):
-# ==============================================================================
+srctree = os.path.abspath(os.environ["srctree"])
+sys.path.insert(0, os.path.join(srctree, "tools/docs/lib"))
 
-    app.add_directive("kernel-include", KernelInclude)
-    return dict(
-        version = __version__,
-        parallel_read_safe = True,
-        parallel_write_safe = True
-    )
+from parse_data_structs import ParseDataStructs
 
-# ==============================================================================
-class KernelInclude(Include):
-# ==============================================================================
+__version__ = "1.0"
+logger = logging.getLogger(__name__)
 
-    """KernelInclude (``kernel-include``) directive"""
+RE_DOMAIN_REF = re.compile(r'\\ :(ref|c:type|c:func):`([^<`]+)(?:<([^>]+)>)?`\\')
+RE_SIMPLE_REF = re.compile(r'`([^`]+)`')
 
-    def run(self):
-        env = self.state.document.settings.env
-        path = os.path.realpath(
-            os.path.expandvars(self.arguments[0]))
+def ErrorString(exc):  # Shamelessly stolen from docutils
+    return f'{exc.__class__.__name__}: {exc}'
 
-        # to get a bit security back, prohibit /etc:
-        if path.startswith(os.sep + "etc"):
-            raise self.severe(
-                'Problems with "%s" directive, prohibited path: %s'
-                % (self.name, path))
-        self.arguments[0] = path
+# ==============================================================================
+class KernelInclude(Directive):
+    """
+    KernelInclude (``kernel-include``) directive
 
-        env.note_dependency(os.path.abspath(path))
+    Most of the stuff here came from Include directive defined at:
+    docutils/parsers/rst/directives/misc.py
 
-        #return super(KernelInclude, self).run() # won't work, see HINTs in _run()
-        return self._run()
+    Yet, overriding the class doesn't have any benefits: the original class
+    only has run() and the argument list. Not all of the arguments are
+    implemented, when checked against the latest Sphinx version, as more
+    arguments were added over time.
 
-    def _run(self):
-        """Include a file as part of the content of this reST file."""
+    So, keep its own list of supported arguments
+    """
 
-        # HINT: I had to copy&paste the whole Include.run method. I'am not happy
-        # with this, but due to security reasons, the Include.run method does
-        # not allow absolute or relative pathnames pointing to locations *above*
-        # the filesystem tree where the reST document is placed.
+ required_arguments = 1 + optional_arguments = 0 + final_argument_whitespace = True + option_spec = { + 'literal': directives.flag, + 'code': directives.unchanged, + 'encoding': directives.encoding, + 'tab-width': int, + 'start-line': int, + 'end-line': int, + 'start-after': directives.unchanged_required, + 'end-before': directives.unchanged_required, + # ignored except for 'literal' or 'code': + 'number-lines': directives.unchanged, # integer or None + 'class': directives.class_option, - if not self.state.document.settings.file_insertion_enabled: - raise self.warning('"%s" directive disabled.' % self.name) - source = self.state_machine.input_lines.source( - self.lineno - self.state_machine.input_offset - 1) - source_dir = os.path.dirname(os.path.abspath(source)) - path = directives.path(self.arguments[0]) - if path.startswith('<') and path.endswith('>'): - path = os.path.join(self.standard_include_path, path[1:-1]) - path = os.path.normpath(os.path.join(source_dir, path)) + # Arguments that aren't from Sphinx Include directive + 'generate-cross-refs': directives.flag, + 'warn-broken': directives.flag, + 'toc': directives.flag, + 'exception-file': directives.unchanged, + } - # HINT: this is the only line I had to change / commented out: - #path = utils.relative_path(None, path) + def read_rawtext(self, path, encoding): + """Read and process file content with error handling""" + try: + self.state.document.settings.record_dependencies.add(path) + include_file = io.FileInput(source_path=path, + encoding=encoding, + error_handler=self.state.document.settings.input_encoding_error_handler) + except UnicodeEncodeError: + raise self.severe('Problems with directive path:\n' + 'Cannot encode input file path "%s" ' + '(wrong locale?).' % path) + except IOError as error: + raise self.severe('Problems with directive path:\n%s.' % ErrorString(error)) - encoding = self.options.get( - 'encoding', self.state.document.settings.input_encoding) - e_handler=self.state.document.settings.input_encoding_error_handler - tab_width = self.options.get( - 'tab-width', self.state.document.settings.tab_width) - try: - self.state.document.settings.record_dependencies.add(path) - include_file = io.FileInput(source_path=path, - encoding=encoding, - error_handler=e_handler) - except UnicodeEncodeError as error: - raise self.severe('Problems with "%s" directive path:\n' - 'Cannot encode input file path "%s" ' - '(wrong locale?).' % - (self.name, SafeString(path))) - except IOError as error: - raise self.severe('Problems with "%s" directive path:\n%s.' 
%
-                              (self.name, ErrorString(error)))
+        try:
+            return include_file.read()
+        except UnicodeError as error:
+            raise self.severe('Problem with directive:\n%s' % ErrorString(error))
+
+    def apply_range(self, rawtext):
+        """
+        Handles start-line, end-line, start-after and end-before parameters
+        """
+
+        # Get to-be-included content
         startline = self.options.get('start-line', None)
         endline = self.options.get('end-line', None)
         try:
             if startline or (endline is not None):
-                lines = include_file.readlines()
-                rawtext = ''.join(lines[startline:endline])
-            else:
-                rawtext = include_file.read()
+                lines = rawtext.splitlines()
+                rawtext = '\n'.join(lines[startline:endline])
         except UnicodeError as error:
-            raise self.severe('Problem with "%s" directive:\n%s' %
-                              (self.name, ErrorString(error)))
+            raise self.severe(f'Problem with "{self.name}" directive:\n' +
+                              io.error_string(error))
 
         # start-after/end-before: no restrictions on newlines in match-text,
         # and no restrictions on matching inside lines vs. line boundaries
-        after_text = self.options.get('start-after', None)
+        after_text = self.options.get("start-after", None)
         if after_text:
             # skip content in rawtext before *and incl.* a matching text
             after_index = rawtext.find(after_text)
             if after_index < 0:
                 raise self.severe('Problem with "start-after" option of "%s" '
-                                  'directive:\nText not found.' % self.name)
-            rawtext = rawtext[after_index + len(after_text):]
-        before_text = self.options.get('end-before', None)
+                                  "directive:\nText not found." % self.name)
+            rawtext = rawtext[after_index + len(after_text) :]
+        before_text = self.options.get("end-before", None)
         if before_text:
             # skip content in rawtext after *and incl.* a matching text
             before_index = rawtext.find(before_text)
             if before_index < 0:
                 raise self.severe('Problem with "end-before" option of "%s" '
-                                  'directive:\nText not found.' % self.name)
+                                  "directive:\nText not found." % self.name)
             rawtext = rawtext[:before_index]
 
+        return rawtext
+
+    def xref_text(self, env, path, tab_width):
+        """
+        Read and add contents from a C file parsed to have cross references.
+
+        There are two types of supported output here:
+        - A C source code with cross-references;
+        - a TOC table containing cross references.
+        """
+        parser = ParseDataStructs()
+        parser.parse_file(path)
+
+        if 'exception-file' in self.options:
+            source_dir = os.path.dirname(os.path.abspath(
+                self.state_machine.input_lines.source(
+                    self.lineno - self.state_machine.input_offset - 1)))
+            exceptions_file = os.path.join(source_dir, self.options['exception-file'])
+            parser.process_exceptions(exceptions_file)
+
+        # Store references on a symbol dict to be used at check time
+        if 'warn-broken' in self.options:
+            env._xref_files.add(path)
+
+        if "toc" not in self.options:
+
+            rawtext = ".. parsed-literal::\n\n" + parser.gen_output()
+            rawtext = self.apply_range(rawtext)
+
+            include_lines = statemachine.string2lines(rawtext, tab_width,
+                                                      convert_whitespace=True)
+
+            # Sphinx always blames the ".. <directive>", so placing
+            # line numbers here won't make any difference
+
+            self.state_machine.insert_input(include_lines, path)
+            return []
+
+        # TOC output is a ReST file, not a literal. So, we can add line
+        # numbers
+
+        rawtext = parser.gen_toc()
+        include_lines = statemachine.string2lines(rawtext, tab_width, convert_whitespace=True)
 
-        if 'literal' in self.options:
-            # Convert tabs to spaces, if `tab_width` is positive.
-            if tab_width >= 0:
-                text = rawtext.expandtabs(tab_width)
-            else:
-                text = rawtext
-            literal_block = nodes.literal_block(rawtext, source=path,
-                                    classes=self.options.get('class', []))
-            literal_block.line = 1
-            self.add_name(literal_block)
-            if 'number-lines' in self.options:
-                try:
-                    startline = int(self.options['number-lines'] or 1)
-                except ValueError:
-                    raise self.error(':number-lines: with non-integer '
-                                     'start value')
-                endline = startline + len(include_lines)
-                if text.endswith('\n'):
-                    text = text[:-1]
-                tokens = NumberLines([([], text)], startline, endline)
-                for classes, value in tokens:
-                    if classes:
-                        literal_block += nodes.inline(value, value,
-                                                      classes=classes)
-                    else:
-                        literal_block += nodes.Text(value, value)
-            return [literal_block]
-        if 'code' in self.options:
-            self.options['source'] = path
-            codeblock = CodeBlock(self.name,
-                                  [self.options.pop('code')], # arguments
-                                  self.options,
-                                  include_lines, # content
-                                  self.lineno,
-                                  self.content_offset,
-                                  self.block_text,
-                                  self.state,
-                                  self.state_machine)
-            return codeblock.run()
-        self.state_machine.insert_input(include_lines, path)
+
+        # Append line numbers data
+
+        startline = self.options.get('start-line', None)
+
+        result = ViewList()
+        if startline and startline > 0:
+            offset = startline - 1
+        else:
+            offset = 0
+
+        for ln, line in enumerate(include_lines, start=offset):
+            result.append(line, path, ln)
+
+        self.state_machine.insert_input(result, path)
+        return []
+
+    def literal(self, path, tab_width, rawtext):
+        """Output a literal block"""
+
+        # Convert tabs to spaces, if `tab_width` is positive.
+        if tab_width >= 0:
+            text = rawtext.expandtabs(tab_width)
+        else:
+            text = rawtext
+        literal_block = nodes.literal_block(rawtext, source=path,
+                                            classes=self.options.get("class", []))
+        literal_block.line = 1
+        self.add_name(literal_block)
+        if "number-lines" in self.options:
+            try:
+                startline = int(self.options["number-lines"] or 1)
+            except ValueError:
+                raise self.error(":number-lines: with non-integer start value")
+            endline = startline + len(text.splitlines())
+            if text.endswith("\n"):
+                text = text[:-1]
+            tokens = NumberLines([([], text)], startline, endline)
+            for classes, value in tokens:
+                if classes:
+                    literal_block += nodes.inline(value, value,
+                                                  classes=classes)
+                else:
+                    literal_block += nodes.Text(value, value)
+        else:
+            literal_block += nodes.Text(text, text)
+        return [literal_block]
+
+    def code(self, path, tab_width, rawtext):
+        """Output a code block"""
+
+        include_lines = statemachine.string2lines(rawtext, tab_width,
+                                                  convert_whitespace=True)
+
+        self.options["source"] = path
+        codeblock = CodeBlock(self.name,
+                              [self.options.pop("code")],  # arguments
+                              self.options,
+                              include_lines,
+                              self.lineno,
+                              self.content_offset,
+                              self.block_text,
+                              self.state,
+                              self.state_machine)
+        return codeblock.run()
+
+    def run(self):
+        """Include a file as part of the content of this reST file."""
+        env = self.state.document.settings.env
+
+        #
+        # The include logic accepts only paths relative to the
+        # kernel source tree. The logic checks this to prevent
+        # directory traversal issues.
+        #
+
+        srctree = os.path.abspath(os.environ["srctree"])
+
+        path = os.path.expandvars(self.arguments[0])
+        src_path = os.path.join(srctree, path)
+
+        if os.path.isfile(src_path):
+            base = srctree
+            path = src_path
+        else:
+            raise self.warning('File "%s" doesn\'t exist' % path)
+
+        abs_base = os.path.abspath(base)
+        abs_full_path = os.path.abspath(os.path.join(base, path))
+
+        try:
+            if os.path.commonpath([abs_full_path, abs_base]) != abs_base:
+                raise self.severe('Problems with "%s" directive, prohibited path: %s' %
+                                  (self.name, path))
+        except ValueError:
+            # Paths don't have the same drive (Windows) or other incompatibility
+            raise self.severe('Problems with "%s" directive, invalid path: %s' %
+                              (self.name, path))
+
+        self.arguments[0] = path
+
+        #
+        # Add path location to Sphinx dependencies to ensure proper cache
+        # invalidation check.
+        #
+
+        env.note_dependency(os.path.abspath(path))
+
+        if not self.state.document.settings.file_insertion_enabled:
+            raise self.warning('"%s" directive disabled.' % self.name)
+        source = self.state_machine.input_lines.source(self.lineno -
+                                                       self.state_machine.input_offset - 1)
+        source_dir = os.path.dirname(os.path.abspath(source))
+        path = directives.path(self.arguments[0])
+        if path.startswith("<") and path.endswith(">"):
+            path = os.path.join(self.standard_include_path, path[1:-1])
+        path = os.path.normpath(os.path.join(source_dir, path))
+
+        # HINT: this is the only line I had to change / commented out:
+        # path = utils.relative_path(None, path)
+
+        encoding = self.options.get("encoding",
+                                    self.state.document.settings.input_encoding)
+        tab_width = self.options.get("tab-width",
+                                     self.state.document.settings.tab_width)
+
+        # Get optional arguments related to cross-references generation
+        if "generate-cross-refs" in self.options:
+            return self.xref_text(env, path, tab_width)
+
+        rawtext = self.read_rawtext(path, encoding)
+        rawtext = self.apply_range(rawtext)
+
+        if "code" in self.options:
+            return self.code(path, tab_width, rawtext)
+
+        return self.literal(path, tab_width, rawtext)
+
+# ==============================================================================
+
+reported = set()
+
+def check_missing_refs(app, env, node, contnode):
+    """Check for broken references in the files we generated xrefs for"""
+    if not node.source:
+        return None
+
+    try:
+        xref_files = env._xref_files
+    except AttributeError:
+        logger.critical("FATAL: _xref_files not initialized!")
+        raise
+
+    # Only show missing references for kernel-include reference-parsed files
+    if node.source not in xref_files:
+        return None
+
+    target = node.get('reftarget', '')
+    domain = node.get('refdomain', 'std')
+    reftype = node.get('reftype', '')
+
+    msg = f"can't link to: {domain}:{reftype}:: {target}"
+
+    # Don't duplicate warnings
+    data = (node.source, msg)
+    if data in reported:
+        return None
+    reported.add(data)
+
+    logger.warning(msg, location=node, type='ref', subtype='missing')
+
+    return None
+
+def merge_xref_info(app, env, docnames, other):
+    """
+    As each process modifies env._xref_files, we need to merge them back.
+    """
+    if not hasattr(other, "_xref_files"):
+        return
+    env._xref_files.update(getattr(other, "_xref_files", set()))
+
+def init_xref_docs(app, env, docnames):
+    """Initialize the set of files for which we generate cross references"""
+    app.env._xref_files = set()
+
+# ==============================================================================
+
+def setup(app):
+    """Setup the Sphinx extension"""
+
+    app.connect("env-before-read-docs", init_xref_docs)
+    app.connect("env-merge-info", merge_xref_info)
+    app.add_directive("kernel-include", KernelInclude)
+    app.connect("missing-reference", check_missing_refs)
+
+    return {
+        "version": __version__,
+        "parallel_read_safe": True,
+        "parallel_write_safe": True,
+    }
diff --git a/Documentation/sphinx/maintainers_include.py b/Documentation/sphinx/maintainers_include.py
index d31cff867436..519ad18685b2 100755
--- a/Documentation/sphinx/maintainers_include.py
+++ b/Documentation/sphinx/maintainers_include.py
@@ -22,10 +22,12 @@ import re
 import os.path
 
 from docutils import statemachine
-from docutils.utils.error_reporting import ErrorString
 from docutils.parsers.rst import Directive
 from docutils.parsers.rst.directives.misc import Include
 
+def ErrorString(exc):  # Shamelessly stolen from docutils
+    return f'{exc.__class__.__name__}: {exc}'
+
 __version__ = '1.0'
 
 def setup(app):
diff --git a/Documentation/sphinx/parse-headers.pl b/Documentation/sphinx/parse-headers.pl
deleted file mode 100755
index 7b1458544e2e..000000000000
--- a/Documentation/sphinx/parse-headers.pl
+++ /dev/null
@@ -1,404 +0,0 @@
-#!/usr/bin/env perl
-# SPDX-License-Identifier: GPL-2.0
-# Copyright (c) 2016 by Mauro Carvalho Chehab <mchehab@kernel.org>.
-
-use strict;
-use Text::Tabs;
-use Getopt::Long;
-use Pod::Usage;
-
-my $debug;
-my $help;
-my $man;
-
-GetOptions(
-	"debug" => \$debug,
-	'usage|?'
=> \$help, - 'help' => \$man -) or pod2usage(2); - -pod2usage(1) if $help; -pod2usage(-exitstatus => 0, -verbose => 2) if $man; -pod2usage(2) if (scalar @ARGV < 2 || scalar @ARGV > 3); - -my ($file_in, $file_out, $file_exceptions) = @ARGV; - -my $data; -my %ioctls; -my %defines; -my %typedefs; -my %enums; -my %enum_symbols; -my %structs; - -require Data::Dumper if ($debug); - -# -# read the file and get identifiers -# - -my $is_enum = 0; -my $is_comment = 0; -open IN, $file_in or die "Can't open $file_in"; -while (<IN>) { - $data .= $_; - - my $ln = $_; - if (!$is_comment) { - $ln =~ s,/\*.*(\*/),,g; - - $is_comment = 1 if ($ln =~ s,/\*.*,,); - } else { - if ($ln =~ s,^(.*\*/),,) { - $is_comment = 0; - } else { - next; - } - } - - if ($is_enum && $ln =~ m/^\s*([_\w][\w\d_]+)\s*[\,=]?/) { - my $s = $1; - my $n = $1; - $n =~ tr/A-Z/a-z/; - $n =~ tr/_/-/; - - $enum_symbols{$s} = "\\ :ref:`$s <$n>`\\ "; - - $is_enum = 0 if ($is_enum && m/\}/); - next; - } - $is_enum = 0 if ($is_enum && m/\}/); - - if ($ln =~ m/^\s*#\s*define\s+([_\w][\w\d_]+)\s+_IO/) { - my $s = $1; - my $n = $1; - $n =~ tr/A-Z/a-z/; - - $ioctls{$s} = "\\ :ref:`$s <$n>`\\ "; - next; - } - - if ($ln =~ m/^\s*#\s*define\s+([_\w][\w\d_]+)\s+/) { - my $s = $1; - my $n = $1; - $n =~ tr/A-Z/a-z/; - $n =~ tr/_/-/; - - $defines{$s} = "\\ :ref:`$s <$n>`\\ "; - next; - } - - if ($ln =~ m/^\s*typedef\s+([_\w][\w\d_]+)\s+(.*)\s+([_\w][\w\d_]+);/) { - my $s = $2; - my $n = $3; - - $typedefs{$n} = "\\ :c:type:`$n <$s>`\\ "; - next; - } - if ($ln =~ m/^\s*enum\s+([_\w][\w\d_]+)\s+\{/ - || $ln =~ m/^\s*enum\s+([_\w][\w\d_]+)$/ - || $ln =~ m/^\s*typedef\s*enum\s+([_\w][\w\d_]+)\s+\{/ - || $ln =~ m/^\s*typedef\s*enum\s+([_\w][\w\d_]+)$/) { - my $s = $1; - - $enums{$s} = "enum :c:type:`$s`\\ "; - - $is_enum = $1; - next; - } - if ($ln =~ m/^\s*struct\s+([_\w][\w\d_]+)\s+\{/ - || $ln =~ m/^\s*struct\s+([[_\w][\w\d_]+)$/ - || $ln =~ m/^\s*typedef\s*struct\s+([_\w][\w\d_]+)\s+\{/ - || $ln =~ m/^\s*typedef\s*struct\s+([[_\w][\w\d_]+)$/ - ) { - my $s = $1; - - $structs{$s} = "struct $s\\ "; - next; - } -} -close IN; - -# -# Handle multi-line typedefs -# - -my @matches = ($data =~ m/typedef\s+struct\s+\S+?\s*\{[^\}]+\}\s*(\S+)\s*\;/g, - $data =~ m/typedef\s+enum\s+\S+?\s*\{[^\}]+\}\s*(\S+)\s*\;/g,); -foreach my $m (@matches) { - my $s = $m; - - $typedefs{$s} = "\\ :c:type:`$s`\\ "; - next; -} - -# -# Handle exceptions, if any -# - -my %def_reftype = ( - "ioctl" => ":ref", - "define" => ":ref", - "symbol" => ":ref", - "typedef" => ":c:type", - "enum" => ":c:type", - "struct" => ":c:type", -); - -if ($file_exceptions) { - open IN, $file_exceptions or die "Can't read $file_exceptions"; - while (<IN>) { - next if (m/^\s*$/ || m/^\s*#/); - - # Parsers to ignore a symbol - - if (m/^ignore\s+ioctl\s+(\S+)/) { - delete $ioctls{$1} if (exists($ioctls{$1})); - next; - } - if (m/^ignore\s+define\s+(\S+)/) { - delete $defines{$1} if (exists($defines{$1})); - next; - } - if (m/^ignore\s+typedef\s+(\S+)/) { - delete $typedefs{$1} if (exists($typedefs{$1})); - next; - } - if (m/^ignore\s+enum\s+(\S+)/) { - delete $enums{$1} if (exists($enums{$1})); - next; - } - if (m/^ignore\s+struct\s+(\S+)/) { - delete $structs{$1} if (exists($structs{$1})); - next; - } - if (m/^ignore\s+symbol\s+(\S+)/) { - delete $enum_symbols{$1} if (exists($enum_symbols{$1})); - next; - } - - # Parsers to replace a symbol - my ($type, $old, $new, $reftype); - - if (m/^replace\s+(\S+)\s+(\S+)\s+(\S+)/) { - $type = $1; - $old = $2; - $new = $3; - } else { - die "Can't parse $file_exceptions: 
$_"; - } - - if ($new =~ m/^\:c\:(data|func|macro|type)\:\`(.+)\`/) { - $reftype = ":c:$1"; - $new = $2; - } elsif ($new =~ m/\:ref\:\`(.+)\`/) { - $reftype = ":ref"; - $new = $1; - } else { - $reftype = $def_reftype{$type}; - } - $new = "$reftype:`$old <$new>`"; - - if ($type eq "ioctl") { - $ioctls{$old} = $new if (exists($ioctls{$old})); - next; - } - if ($type eq "define") { - $defines{$old} = $new if (exists($defines{$old})); - next; - } - if ($type eq "symbol") { - $enum_symbols{$old} = $new if (exists($enum_symbols{$old})); - next; - } - if ($type eq "typedef") { - $typedefs{$old} = $new if (exists($typedefs{$old})); - next; - } - if ($type eq "enum") { - $enums{$old} = $new if (exists($enums{$old})); - next; - } - if ($type eq "struct") { - $structs{$old} = $new if (exists($structs{$old})); - next; - } - - die "Can't parse $file_exceptions: $_"; - } -} - -if ($debug) { - print Data::Dumper->Dump([\%ioctls], [qw(*ioctls)]) if (%ioctls); - print Data::Dumper->Dump([\%typedefs], [qw(*typedefs)]) if (%typedefs); - print Data::Dumper->Dump([\%enums], [qw(*enums)]) if (%enums); - print Data::Dumper->Dump([\%structs], [qw(*structs)]) if (%structs); - print Data::Dumper->Dump([\%defines], [qw(*defines)]) if (%defines); - print Data::Dumper->Dump([\%enum_symbols], [qw(*enum_symbols)]) if (%enum_symbols); -} - -# -# Align block -# -$data = expand($data); -$data = " " . $data; -$data =~ s/\n/\n /g; -$data =~ s/\n\s+$/\n/g; -$data =~ s/\n\s+\n/\n\n/g; - -# -# Add escape codes for special characters -# -$data =~ s,([\_\`\*\<\>\&\\\\:\/\|\%\$\#\{\}\~\^]),\\$1,g; - -$data =~ s,DEPRECATED,**DEPRECATED**,g; - -# -# Add references -# - -my $start_delim = "[ \n\t\(\=\*\@]"; -my $end_delim = "(\\s|,|\\\\=|\\\\:|\\;|\\\)|\\}|\\{)"; - -foreach my $r (keys %ioctls) { - my $s = $ioctls{$r}; - - $r =~ s,([\_\`\*\<\>\&\\\\:\/]),\\\\$1,g; - - print "$r -> $s\n" if ($debug); - - $data =~ s/($start_delim)($r)$end_delim/$1$s$3/g; -} - -foreach my $r (keys %defines) { - my $s = $defines{$r}; - - $r =~ s,([\_\`\*\<\>\&\\\\:\/]),\\\\$1,g; - - print "$r -> $s\n" if ($debug); - - $data =~ s/($start_delim)($r)$end_delim/$1$s$3/g; -} - -foreach my $r (keys %enum_symbols) { - my $s = $enum_symbols{$r}; - - $r =~ s,([\_\`\*\<\>\&\\\\:\/]),\\\\$1,g; - - print "$r -> $s\n" if ($debug); - - $data =~ s/($start_delim)($r)$end_delim/$1$s$3/g; -} - -foreach my $r (keys %enums) { - my $s = $enums{$r}; - - $r =~ s,([\_\`\*\<\>\&\\\\:\/]),\\\\$1,g; - - print "$r -> $s\n" if ($debug); - - $data =~ s/enum\s+($r)$end_delim/$s$2/g; -} - -foreach my $r (keys %structs) { - my $s = $structs{$r}; - - $r =~ s,([\_\`\*\<\>\&\\\\:\/]),\\\\$1,g; - - print "$r -> $s\n" if ($debug); - - $data =~ s/struct\s+($r)$end_delim/$s$2/g; -} - -foreach my $r (keys %typedefs) { - my $s = $typedefs{$r}; - - $r =~ s,([\_\`\*\<\>\&\\\\:\/]),\\\\$1,g; - - print "$r -> $s\n" if ($debug); - $data =~ s/($start_delim)($r)$end_delim/$1$s$3/g; -} - -$data =~ s/\\ ([\n\s])/\1/g; - -# -# Generate output file -# - -my $title = $file_in; -$title =~ s,.*/,,; - -open OUT, "> $file_out" or die "Can't open $file_out"; -print OUT ".. -*- coding: utf-8; mode: rst -*-\n\n"; -print OUT "$title\n"; -print OUT "=" x length($title); -print OUT "\n\n.. parsed-literal::\n\n"; -print OUT $data; -close OUT; - -__END__ - -=head1 NAME - -parse_headers.pl - parse a C file, in order to identify functions, structs, -enums and defines and create cross-references to a Sphinx book. 
- -=head1 SYNOPSIS - -B<parse_headers.pl> [<options>] <C_FILE> <OUT_FILE> [<EXCEPTIONS_FILE>] - -Where <options> can be: --debug, --help or --usage. - -=head1 OPTIONS - -=over 8 - -=item B<--debug> - -Put the script in verbose mode, useful for debugging. - -=item B<--usage> - -Prints a brief help message and exits. - -=item B<--help> - -Prints a more detailed help message and exits. - -=back - -=head1 DESCRIPTION - -Convert a C header or source file (C_FILE), into a ReStructured Text -included via ..parsed-literal block with cross-references for the -documentation files that describe the API. It accepts an optional -EXCEPTIONS_FILE with describes what elements will be either ignored or -be pointed to a non-default reference. - -The output is written at the (OUT_FILE). - -It is capable of identifying defines, functions, structs, typedefs, -enums and enum symbols and create cross-references for all of them. -It is also capable of distinguish #define used for specifying a Linux -ioctl. - -The EXCEPTIONS_FILE contain two rules to allow ignoring a symbol or -to replace the default references by a custom one. - -Please read Documentation/doc-guide/parse-headers.rst at the Kernel's -tree for more details. - -=head1 BUGS - -Report bugs to Mauro Carvalho Chehab <mchehab@kernel.org> - -=head1 COPYRIGHT - -Copyright (c) 2016 by Mauro Carvalho Chehab <mchehab@kernel.org>. - -License GPLv2: GNU GPL version 2 <https://gnu.org/licenses/gpl.html>. - -This is free software: you are free to change and redistribute it. -There is NO WARRANTY, to the extent permitted by law. - -=cut diff --git a/Documentation/sphinx/templates/kernel-toc.html b/Documentation/sphinx/templates/kernel-toc.html index 41f1efbe64bb..b84969bd31c4 100644 --- a/Documentation/sphinx/templates/kernel-toc.html +++ b/Documentation/sphinx/templates/kernel-toc.html @@ -1,4 +1,5 @@ -<!-- SPDX-License-Identifier: GPL-2.0 --> +{# SPDX-License-Identifier: GPL-2.0 #} + {# Create a local TOC the kernel way #} <p> <h3 class="kernel-toc-contents">Contents</h3> diff --git a/Documentation/sphinx/templates/translations.html b/Documentation/sphinx/templates/translations.html index 8df5d42d8dcd..351586f41938 100644 --- a/Documentation/sphinx/templates/translations.html +++ b/Documentation/sphinx/templates/translations.html @@ -1,5 +1,5 @@ -<!-- SPDX-License-Identifier: GPL-2.0 --> -<!-- Copyright © 2023, Oracle and/or its affiliates. --> +{# SPDX-License-Identifier: GPL-2.0 #} +{# Copyright © 2023, Oracle and/or its affiliates. #} {# Create a language menu for translations #} {% if languages|length > 0: %} diff --git a/Documentation/staging/remoteproc.rst b/Documentation/staging/remoteproc.rst index 348ee7e508ac..5c226fa076d6 100644 --- a/Documentation/staging/remoteproc.rst +++ b/Documentation/staging/remoteproc.rst @@ -104,7 +104,7 @@ Typical usage rproc_shutdown(my_rproc); } -API for implementors +API for implementers ==================== :: diff --git a/Documentation/trace/boottime-trace.rst b/Documentation/trace/boottime-trace.rst index 3efac10adb36..651f3a2c01de 100644 --- a/Documentation/trace/boottime-trace.rst +++ b/Documentation/trace/boottime-trace.rst @@ -19,7 +19,7 @@ this uses bootconfig file to describe tracing feature programming. Options in the Boot Config ========================== -Here is the list of available options list for boot time tracing in +Here is the list of available options for boot time tracing in boot config file [1]_. All options are under "ftrace." or "kernel." prefix. 
See kernel parameters for the options which starts with "kernel." prefix [2]_. diff --git a/Documentation/trace/debugging.rst b/Documentation/trace/debugging.rst index d54bc500af80..4d88c346fc38 100644 --- a/Documentation/trace/debugging.rst +++ b/Documentation/trace/debugging.rst @@ -59,7 +59,7 @@ There is various methods of acquiring the state of the system when a kernel crash occurs. This could be from the oops message in printk, or one could use kexec/kdump. But these just show what happened at the time of the crash. It can be very useful in knowing what happened up to the point of the crash. -The tracing ring buffer, by default, is a circular buffer than will +The tracing ring buffer, by default, is a circular buffer that will overwrite older events with newer ones. When a crash happens, the content of the ring buffer will be all the events that lead up to the crash. diff --git a/Documentation/trace/events.rst b/Documentation/trace/events.rst index 2d88a2acacc0..18d112963dec 100644 --- a/Documentation/trace/events.rst +++ b/Documentation/trace/events.rst @@ -629,8 +629,8 @@ following: - tracing synthetic events from in-kernel code - the low-level "dynevent_cmd" API -7.1 Dyamically creating synthetic event definitions ---------------------------------------------------- +7.1 Dynamically creating synthetic event definitions +---------------------------------------------------- There are a couple ways to create a new synthetic event from a kernel module or other kernel code. @@ -944,8 +944,8 @@ Note that synth_event_trace_end() must be called at the end regardless of whether any of the add calls failed (say due to a bad field name being passed in). -7.3 Dyamically creating kprobe and kretprobe event definitions --------------------------------------------------------------- +7.3 Dynamically creating kprobe and kretprobe event definitions +--------------------------------------------------------------- To create a kprobe or kretprobe trace event from kernel code, the kprobe_event_gen_cmd_start() or kretprobe_event_gen_cmd_start() diff --git a/Documentation/trace/fprobe.rst b/Documentation/trace/fprobe.rst index 71cd40472d36..06b0edad0179 100644 --- a/Documentation/trace/fprobe.rst +++ b/Documentation/trace/fprobe.rst @@ -81,7 +81,7 @@ Same as ftrace, the registered callbacks will start being called some time after the register_fprobe() is called and before it returns. See :file:`Documentation/trace/ftrace.rst`. -Also, the unregister_fprobe() will guarantee that the both enter and exit +Also, the unregister_fprobe() will guarantee that both enter and exit handlers are no longer being called by functions after unregister_fprobe() returns as same as unregister_ftrace_function(). diff --git a/Documentation/trace/ftrace-uses.rst b/Documentation/trace/ftrace-uses.rst index e198854ace79..e225cc46b71e 100644 --- a/Documentation/trace/ftrace-uses.rst +++ b/Documentation/trace/ftrace-uses.rst @@ -193,7 +193,7 @@ FTRACE_OPS_FL_RECURSION Note, if this flag is set, then the callback will always be called with preemption disabled. If it is not set, then it is possible (but not guaranteed) that the callback will be called in - preemptable context. + preemptible context. FTRACE_OPS_FL_IPMODIFY Requires FTRACE_OPS_FL_SAVE_REGS set. 
If the callback is to "hijack" diff --git a/Documentation/trace/ftrace.rst b/Documentation/trace/ftrace.rst index af66a05e18cc..aef674df3afd 100644 --- a/Documentation/trace/ftrace.rst +++ b/Documentation/trace/ftrace.rst @@ -30,7 +30,7 @@ disabled and enabled, as well as for preemption and from a time a task is woken to the task is actually scheduled in. One of the most common uses of ftrace is the event tracing. -Throughout the kernel is hundreds of static event points that +Throughout the kernel are hundreds of static event points that can be enabled via the tracefs file system to see what is going on in certain parts of the kernel. @@ -383,7 +383,7 @@ of ftrace. Here is a list of some of the key files: not be listed in this count. If the callback registered to be traced by a function with - the "save regs" attribute (thus even more overhead), a 'R' + the "save regs" attribute (thus even more overhead), an 'R' will be displayed on the same line as the function that is returning registers. @@ -392,7 +392,7 @@ of ftrace. Here is a list of some of the key files: an 'I' will be displayed on the same line as the function that can be overridden. - If a non ftrace trampoline is attached (BPF) a 'D' will be displayed. + If a non-ftrace trampoline is attached (BPF) a 'D' will be displayed. Note, normal ftrace trampolines can also be attached, but only one "direct" trampoline can be attached to a given function at a time. @@ -402,7 +402,7 @@ of ftrace. Here is a list of some of the key files: If a function had either the "ip modify" or a "direct" call attached to it in the past, a 'M' will be shown. This flag is never cleared. It is - used to know if a function was every modified by the ftrace infrastructure, + used to know if a function was ever modified by the ftrace infrastructure, and can be used for debugging. If the architecture supports it, it will also show what callback @@ -418,7 +418,7 @@ of ftrace. Here is a list of some of the key files: This file contains all the functions that ever had a function callback to it via the ftrace infrastructure. It has the same format as - enabled_functions but shows all functions that have every been + enabled_functions but shows all functions that have ever been traced. To see any function that has every been modified by "ip modify" or a @@ -517,7 +517,7 @@ of ftrace. Here is a list of some of the key files: Whenever an event is recorded into the ring buffer, a "timestamp" is added. This stamp comes from a specified clock. By default, ftrace uses the "local" clock. This - clock is very fast and strictly per cpu, but on some + clock is very fast and strictly per CPU, but on some systems it may not be monotonic with respect to other CPUs. In other words, the local clocks may not be in sync with local clocks on other CPUs. @@ -868,7 +868,7 @@ Here is the list of current tracers that may be configured. "mmiotrace" - A special tracer that is used to trace binary module. + A special tracer that is used to trace binary modules. It will trace all the calls that a module makes to the hardware. Everything it writes and reads from the I/O as well. diff --git a/Documentation/trace/histogram-design.rst b/Documentation/trace/histogram-design.rst index 5765eb3e9efa..ae71b5bf97c6 100644 --- a/Documentation/trace/histogram-design.rst +++ b/Documentation/trace/histogram-design.rst @@ -11,13 +11,14 @@ histograms work and how the individual pieces map to the data structures used to implement them in trace_events_hist.c and tracing_map.c. 
-Note: All the ftrace histogram command examples assume the working - directory is the ftrace /tracing directory. For example:: +.. note:: + All the ftrace histogram command examples assume the working + directory is the ftrace /tracing directory. For example:: # cd /sys/kernel/tracing -Also, the histogram output displayed for those commands will be -generally be truncated - only enough to make the point is displayed. + Also, the histogram output displayed for those commands will + generally be truncated - only enough to make the point is displayed. 'hist_debug' trace event files ============================== @@ -142,30 +143,30 @@ elements for a couple hypothetical keys and values.:: +--------------+ | | n_keys = n_fields - n_vals | | -The hist_data n_vals and n_fields delineate the extent of the fields[] | | -array and separate keys from values for the rest of the code. | | - -Below is a run-time representation of the tracing_map part of the | | -histogram, with pointers from various parts of the fields[] array | | -to corresponding parts of the tracing_map. | | - -The tracing_map consists of an array of tracing_map_entrys and a set | | -of preallocated tracing_map_elts (abbreviated below as map_entry and | | -map_elt). The total number of map_entrys in the hist_data.map array = | | -map->max_elts (actually map->map_size but only max_elts of those are | | -used. This is a property required by the map_insert() algorithm). | | - -If a map_entry is unused, meaning no key has yet hashed into it, its | | -.key value is 0 and its .val pointer is NULL. Once a map_entry has | | -been claimed, the .key value contains the key's hash value and the | | -.val member points to a map_elt containing the full key and an entry | | -for each key or value in the map_elt.fields[] array. There is an | | -entry in the map_elt.fields[] array corresponding to each hist_field | | -in the histogram, and this is where the continually aggregated sums | | -corresponding to each histogram value are kept. | | - -The diagram attempts to show the relationship between the | | -hist_data.fields[] and the map_elt.fields[] with the links drawn | | +The hist_data n_vals and n_fields delineate the extent of the fields[] +array and separate keys from values for the rest of the code. + +Below is a run-time representation of the tracing_map part of the +histogram, with pointers from various parts of the fields[] array +to corresponding parts of the tracing_map. + +The tracing_map consists of an array of tracing_map_entrys and a set +of preallocated tracing_map_elts (abbreviated below as map_entry and +map_elt). The total number of map_entrys in the hist_data.map array = +map->max_elts (actually map->map_size but only max_elts of those are +used. This is a property required by the map_insert() algorithm). + +If a map_entry is unused, meaning no key has yet hashed into it, its +.key value is 0 and its .val pointer is NULL. Once a map_entry has +been claimed, the .key value contains the key's hash value and the +.val member points to a map_elt containing the full key and an entry +for each key or value in the map_elt.fields[] array. There is an +entry in the map_elt.fields[] array corresponding to each hist_field +in the histogram, and this is where the continually aggregated sums +corresponding to each histogram value are kept.
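The entry/element split described above can be reduced to a small sketch; the struct shapes below are assumed for illustration only and are not the definitions from tracing_map.c::

	#include <linux/types.h>

	/* Assumed shapes, illustration only. */
	struct sketch_map_elt {
		void *key;	/* full key, for checks against the hash */
		u64 fields[8];	/* one aggregated sum per hist_field */
	};

	struct sketch_map_entry {
		u32 key;			/* key's hash value; 0 while unused */
		struct sketch_map_elt *val;	/* NULL until the entry is claimed */
	};

Keeping both the hash in the entry and the full key in the element is presumably what lets map_insert() resolve hash collisions against the full key.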
+ +The diagram attempts to show the relationship between the +hist_data.fields[] and the map_elt.fields[] with the links drawn between diagrams:: +-----------+ | | @@ -380,7 +381,9 @@ entry, ts0, corresponding to the ts0 variable in the sched_waking trigger above. sched_waking histogram -----------------------:: +---------------------- + +.. code-block:: +------------------+ | hist_data |<-------------------------------------------------------+ @@ -440,31 +443,31 @@ sched_waking histogram n_keys = n_fields - n_vals | | | | | | -This is very similar to the basic case. In the above diagram, we can | | | -see a new .flags member has been added to the struct hist_field | | | -struct, and a new entry added to hist_data.fields representing the ts0 | | | -variable. For a normal val hist_field, .flags is just 0 (modulo | | | -modifier flags), but if the value is defined as a variable, the .flags | | | -contains a set FL_VAR bit. | | | - -As you can see, the ts0 entry's .var.idx member contains the index | | | -into the tracing_map_elts' .vars[] array containing variable values. | | | -This idx is used whenever the value of the variable is set or read. | | | -The map_elt.vars idx assigned to the given variable is assigned and | | | -saved in .var.idx by create_tracing_map_fields() after it calls | | | -tracing_map_add_var(). | | | - -Below is a representation of the histogram at run-time, which | | | -populates the map, along with correspondence to the above hist_data and | | | -hist_field data structures. | | | - -The diagram attempts to show the relationship between the | | | -hist_data.fields[] and the map_elt.fields[] and map_elt.vars[] with | | | -the links drawn between diagrams. For each of the map_elts, you can | | | -see that the .fields[] members point to the .sum or .offset of a key | | | -or val and the .vars[] members point to the value of a variable. The | | | -arrows between the two diagrams show the linkages between those | | | -tracing_map members and the field definitions in the corresponding | | | +This is very similar to the basic case. In the above diagram, we can +see a new .flags member has been added to the struct hist_field +struct, and a new entry added to hist_data.fields representing the ts0 +variable. For a normal val hist_field, .flags is just 0 (modulo +modifier flags), but if the value is defined as a variable, the .flags +contains a set FL_VAR bit. + +As you can see, the ts0 entry's .var.idx member contains the index +into the tracing_map_elts' .vars[] array containing variable values. +This idx is used whenever the value of the variable is set or read. +The map_elt.vars idx assigned to the given variable is assigned and +saved in .var.idx by create_tracing_map_fields() after it calls +tracing_map_add_var(). + +Below is a representation of the histogram at run-time, which +populates the map, along with correspondence to the above hist_data and +hist_field data structures. + +The diagram attempts to show the relationship between the +hist_data.fields[] and the map_elt.fields[] and map_elt.vars[] with +the links drawn between diagrams. For each of the map_elts, you can +see that the .fields[] members point to the .sum or .offset of a key +or val and the .vars[] members point to the value of a variable. 
The +arrows between the two diagrams show the linkages between those +tracing_map members and the field definitions in the corresponding hist_data fields[] members.:: +-----------+ | | | @@ -565,40 +568,40 @@ hist_data fields[] members.:: | | | | +---------------+ | | -For each used map entry, there's a map_elt pointing to an array of | | -.vars containing the current value of the variables associated with | | -that histogram entry. So in the above, the timestamp associated with | | -pid 999 is 113345679876, and the timestamp variable in the same | | -.var.idx for pid 4444 is 213499240729. | | - -sched_switch histogram | | ----------------------- | | - -The sched_switch histogram paired with the above sched_waking | | -histogram is shown below. The most important aspect of the | | -sched_switch histogram is that it references a variable on the | | -sched_waking histogram above. | | - -The histogram diagram is very similar to the others so far displayed, | | -but it adds variable references. You can see the normal hitcount and | | -key fields along with a new wakeup_lat variable implemented in the | | -same way as the sched_waking ts0 variable, but in addition there's an | | -entry with the new FL_VAR_REF (short for HIST_FIELD_FL_VAR_REF) flag. | | - -Associated with the new var ref field are a couple of new hist_field | | -members, var.hist_data and var_ref_idx. For a variable reference, the | | -var.hist_data goes with the var.idx, which together uniquely identify | | -a particular variable on a particular histogram. The var_ref_idx is | | -just the index into the var_ref_vals[] array that caches the values of | | -each variable whenever a hist trigger is updated. Those resulting | | -values are then finally accessed by other code such as trace action | | -code that uses the var_ref_idx values to assign param values. | | - -The diagram below describes the situation for the sched_switch | | +For each used map entry, there's a map_elt pointing to an array of +.vars containing the current value of the variables associated with +that histogram entry. So in the above, the timestamp associated with +pid 999 is 113345679876, and the timestamp variable in the same +.var.idx for pid 4444 is 213499240729. + +sched_switch histogram +---------------------- + +The sched_switch histogram paired with the above sched_waking +histogram is shown below. The most important aspect of the +sched_switch histogram is that it references a variable on the +sched_waking histogram above. + +The histogram diagram is very similar to the others so far displayed, +but it adds variable references. You can see the normal hitcount and +key fields along with a new wakeup_lat variable implemented in the +same way as the sched_waking ts0 variable, but in addition there's an +entry with the new FL_VAR_REF (short for HIST_FIELD_FL_VAR_REF) flag. + +Associated with the new var ref field are a couple of new hist_field +members, var.hist_data and var_ref_idx. For a variable reference, the +var.hist_data goes with the var.idx, which together uniquely identify +a particular variable on a particular histogram. The var_ref_idx is +just the index into the var_ref_vals[] array that caches the values of +each variable whenever a hist trigger is updated. Those resulting +values are then finally accessed by other code such as trace action +code that uses the var_ref_idx values to assign param values. 
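The indirection in that last step is small enough to sketch; the helper below is invented for illustration, and only var_ref_vals[] and var_ref_idx come from the description above::

	#include <linux/types.h>

	/* Invented helper: how action code would pick up the cached value
	 * of a variable through the var_ref_idx stored in the referencing
	 * hist_field.
	 */
	static u64 sketch_resolve_var_ref(const u64 *var_ref_vals,
					  unsigned int var_ref_idx)
	{
		/* var_ref_vals[] is refilled whenever the hist trigger updates */
		return var_ref_vals[var_ref_idx];
	}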
+ +The diagram below describes the situation for the sched_switch histogram referred to before:: - # echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0' >> | | - events/sched/sched_switch/trigger | | + # echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0' >> + events/sched/sched_switch/trigger | | +------------------+ | | | hist_data | | | diff --git a/Documentation/trace/histogram.rst b/Documentation/trace/histogram.rst index 2b98c1720a54..340bcb5099e7 100644 --- a/Documentation/trace/histogram.rst +++ b/Documentation/trace/histogram.rst @@ -186,8 +186,8 @@ Documentation written by Tom Zanussi The examples below provide a more concrete illustration of the concepts and typical usage patterns discussed above. -'special' event fields ------------------------- +2.1. 'special' event fields +--------------------------- There are a number of 'special event fields' available for use as keys or values in a hist trigger. These look like and behave as if @@ -204,16 +204,16 @@ Documentation written by Tom Zanussi common_cpu int the cpu on which the event occurred. ====================== ==== ======================================= -Extended error information --------------------------- +2.2. Extended error information +------------------------------- For some error conditions encountered when invoking a hist trigger command, extended error information is available via the - tracing/error_log file. See Error Conditions in - :file:`Documentation/trace/ftrace.rst` for details. + tracing/error_log file. See "Error conditions" section in + Documentation/trace/ftrace.rst for details. -6.2 'hist' trigger examples ---------------------------- +2.3. 'hist' trigger examples +---------------------------- The first set of examples creates aggregations using the kmalloc event. The fields that can be used for the hist trigger are listed @@ -840,7 +840,7 @@ Extended error information The compound key examples used a key and a sum value (hitcount) to sort the output, but we can just as easily use two keys instead. - Here's an example where we use a compound key composed of the the + Here's an example where we use a compound key composed of the common_pid and size event fields. Sorting with pid as the primary key and 'size' as the secondary key allows us to display an ordered summary of the recvfrom sizes, with counts, received by @@ -1608,8 +1608,8 @@ Extended error information Entries: 7 Dropped: 0 -2.2 Inter-event hist triggers ------------------------------ +2.4. Inter-event hist triggers +------------------------------ Inter-event hist triggers are hist triggers that combine values from one or more other events and create a histogram using that data. Data @@ -1685,8 +1685,8 @@ pseudo-file. These features are described in more detail in the following sections. -2.2.1 Histogram Variables -------------------------- +2.5. Histogram Variables +------------------------ Variables are simply named locations used for saving and retrieving values between matching events. A 'matching' event is defined as an @@ -1789,8 +1789,8 @@ or assigned to a variable and referenced in a subsequent expression:: Variables can even hold stacktraces, which are useful with synthetic events. -2.2.2 Synthetic Events ----------------------- +2.6. Synthetic Events +--------------------- Synthetic events are user-defined events generated from hist trigger variables or fields associated with one or more other events. 
Their @@ -1846,7 +1846,7 @@ the command that defined it with a '!':: At this point, there isn't yet an actual 'wakeup_latency' event instantiated in the event subsystem - for this to happen, a 'hist trigger action' needs to be instantiated and bound to actual fields -and variables defined on other events (see Section 2.2.3 below on +and variables defined on other events (see Section 2.7. below on how that is done using hist trigger 'onmatch' action). Once that is done, the 'wakeup_latency' synthetic event instance is created. @@ -2094,8 +2094,8 @@ histogram:: Entries: 7 Dropped: 0 -2.2.3 Hist trigger 'handlers' and 'actions' -------------------------------------------- +2.7. Hist trigger 'handlers' and 'actions' +------------------------------------------ A hist trigger 'action' is a function that's executed (in most cases conditionally) whenever a histogram entry is added or updated. @@ -2526,8 +2526,8 @@ The following commonly-used handler.action pairs are available: kworker/3:2-135 [003] d..3 49.823123: sched_switch: prev_comm=kworker/3:2 prev_pid=135 prev_prio=120 prev_state=T ==> next_comm=swapper/3 next_pid=0 next_prio=120 <idle>-0 [004] ..s7 49.823798: tcp_probe: src=10.0.0.10:54326 dest=23.215.104.193:80 mark=0x0 length=32 snd_nxt=0xe3ae2ff5 snd_una=0xe3ae2ecd snd_cwnd=10 ssthresh=2147483647 snd_wnd=28960 srtt=19604 rcv_wnd=29312 -3. User space creating a trigger --------------------------------- +2.8. User space creating a trigger +---------------------------------- Writing into /sys/kernel/tracing/trace_marker writes into the ftrace ring buffer. This can also act like an event, by writing into the trigger diff --git a/Documentation/trace/rv/monitor_synthesis.rst b/Documentation/trace/rv/monitor_synthesis.rst index ac808a7554f5..3a7d7b2f6cb6 100644 --- a/Documentation/trace/rv/monitor_synthesis.rst +++ b/Documentation/trace/rv/monitor_synthesis.rst @@ -181,7 +181,7 @@ which is the list of atomic propositions present in the LTL specification functions interacting with the Buchi automaton. While generating code, `rvgen` cannot understand the meaning of the atomic -propositions. Thus, that task is left for manual work. The recommended pratice +propositions. Thus, that task is left for manual work. The recommended practice is adding tracepoints to places where the atomic propositions change; and in the tracepoints' handlers: the Buchi automaton is executed using:: diff --git a/Documentation/translations/it_IT/process/changes.rst b/Documentation/translations/it_IT/process/changes.rst index 77db13c4022b..7e93833b4511 100644 --- a/Documentation/translations/it_IT/process/changes.rst +++ b/Documentation/translations/it_IT/process/changes.rst @@ -46,7 +46,6 @@ util-linux 2.10o mount --version kmod 13 depmod -V e2fsprogs 1.41.4 e2fsck -V jfsutils 1.1.3 fsck.jfs -V -reiserfsprogs 3.6.3 reiserfsck -V xfsprogs 2.6.0 xfs_db -V squashfs-tools 4.0 mksquashfs -version btrfs-progs 0.18 btrfsck @@ -260,14 +259,6 @@ Sono disponibili i seguenti strumenti: - sono disponibili altri strumenti per il file-system. -Reiserfsprogs -------------- - -Il pacchetto reiserfsprogs dovrebbe essere usato con reiserfs-3.6.x (Linux -kernel 2.4.x). Questo è un pacchetto combinato che contiene versioni -funzionanti di ``mkreiserfs``, ``resize_reiserfs``, ``debugreiserfs`` e -``reiserfsck``. Questi programmi funzionano sulle piattaforme i386 e alpha. 
- Xfsprogs -------- @@ -479,11 +470,6 @@ JFSutils - <https://jfs.sourceforge.net/> -Reiserfsprogs -------------- - -- <https://git.kernel.org/pub/scm/linux/kernel/git/jeffm/reiserfsprogs.git/> - Xfsprogs -------- diff --git a/Documentation/userspace-api/media/Makefile b/Documentation/userspace-api/media/Makefile deleted file mode 100644 index 3d8aaf5c253b..000000000000 --- a/Documentation/userspace-api/media/Makefile +++ /dev/null @@ -1,64 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 - -# Rules to convert a .h file to inline RST documentation - -SRC_DIR=$(srctree)/Documentation/userspace-api/media -PARSER = $(srctree)/Documentation/sphinx/parse-headers.pl -UAPI = $(srctree)/include/uapi/linux -KAPI = $(srctree)/include/linux - -FILES = ca.h.rst dmx.h.rst frontend.h.rst net.h.rst \ - videodev2.h.rst media.h.rst cec.h.rst lirc.h.rst - -TARGETS := $(addprefix $(BUILDDIR)/, $(FILES)) - -gen_rst = \ - echo ${PARSER} $< $@ $(SRC_DIR)/$(notdir $@).exceptions; \ - ${PARSER} $< $@ $(SRC_DIR)/$(notdir $@).exceptions - -quiet_gen_rst = echo ' PARSE $(patsubst $(srctree)/%,%,$<)'; \ - ${PARSER} $< $@ $(SRC_DIR)/$(notdir $@).exceptions - -silent_gen_rst = ${gen_rst} - -$(BUILDDIR)/ca.h.rst: ${UAPI}/dvb/ca.h ${PARSER} $(SRC_DIR)/ca.h.rst.exceptions - @$($(quiet)gen_rst) - -$(BUILDDIR)/dmx.h.rst: ${UAPI}/dvb/dmx.h ${PARSER} $(SRC_DIR)/dmx.h.rst.exceptions - @$($(quiet)gen_rst) - -$(BUILDDIR)/frontend.h.rst: ${UAPI}/dvb/frontend.h ${PARSER} $(SRC_DIR)/frontend.h.rst.exceptions - @$($(quiet)gen_rst) - -$(BUILDDIR)/net.h.rst: ${UAPI}/dvb/net.h ${PARSER} $(SRC_DIR)/net.h.rst.exceptions - @$($(quiet)gen_rst) - -$(BUILDDIR)/videodev2.h.rst: ${UAPI}/videodev2.h ${PARSER} $(SRC_DIR)/videodev2.h.rst.exceptions - @$($(quiet)gen_rst) - -$(BUILDDIR)/media.h.rst: ${UAPI}/media.h ${PARSER} $(SRC_DIR)/media.h.rst.exceptions - @$($(quiet)gen_rst) - -$(BUILDDIR)/cec.h.rst: ${UAPI}/cec.h ${PARSER} $(SRC_DIR)/cec.h.rst.exceptions - @$($(quiet)gen_rst) - -$(BUILDDIR)/lirc.h.rst: ${UAPI}/lirc.h ${PARSER} $(SRC_DIR)/lirc.h.rst.exceptions - @$($(quiet)gen_rst) - -# Media build rules - -.PHONY: all html texinfo epub xml latex - -all: $(IMGDOT) $(BUILDDIR) ${TARGETS} -html: all -texinfo: all -epub: all -xml: all -latex: $(IMGPDF) all -linkcheck: - -clean: - -rm -f $(DOTTGT) $(IMGTGT) ${TARGETS} 2>/dev/null - -$(BUILDDIR): - $(Q)mkdir -p $@ diff --git a/Documentation/userspace-api/media/cec/cec-header.rst b/Documentation/userspace-api/media/cec/cec-header.rst index d70736ac2b1d..f67003bb8740 100644 --- a/Documentation/userspace-api/media/cec/cec-header.rst +++ b/Documentation/userspace-api/media/cec/cec-header.rst @@ -6,5 +6,6 @@ CEC Header File *************** -.. kernel-include:: $BUILDDIR/cec.h.rst - +.. 
kernel-include:: include/uapi/linux/cec.h + :generate-cross-refs: + :exception-file: cec.h.rst.exceptions diff --git a/Documentation/userspace-api/media/cec.h.rst.exceptions b/Documentation/userspace-api/media/cec/cec.h.rst.exceptions index 15fa1752d4ef..15fa1752d4ef 100644 --- a/Documentation/userspace-api/media/cec.h.rst.exceptions +++ b/Documentation/userspace-api/media/cec/cec.h.rst.exceptions diff --git a/Documentation/userspace-api/media/ca.h.rst.exceptions b/Documentation/userspace-api/media/dvb/ca.h.rst.exceptions index f6828238eb48..f6828238eb48 100644 --- a/Documentation/userspace-api/media/ca.h.rst.exceptions +++ b/Documentation/userspace-api/media/dvb/ca.h.rst.exceptions diff --git a/Documentation/userspace-api/media/dmx.h.rst.exceptions b/Documentation/userspace-api/media/dvb/dmx.h.rst.exceptions index afc14d384b83..afc14d384b83 100644 --- a/Documentation/userspace-api/media/dmx.h.rst.exceptions +++ b/Documentation/userspace-api/media/dvb/dmx.h.rst.exceptions diff --git a/Documentation/userspace-api/media/frontend.h.rst.exceptions b/Documentation/userspace-api/media/dvb/frontend.h.rst.exceptions index dcaf5740de7e..dcaf5740de7e 100644 --- a/Documentation/userspace-api/media/frontend.h.rst.exceptions +++ b/Documentation/userspace-api/media/dvb/frontend.h.rst.exceptions diff --git a/Documentation/userspace-api/media/dvb/headers.rst b/Documentation/userspace-api/media/dvb/headers.rst index 88c3eb33a89e..c75f64cf21d5 100644 --- a/Documentation/userspace-api/media/dvb/headers.rst +++ b/Documentation/userspace-api/media/dvb/headers.rst @@ -7,10 +7,19 @@ Digital TV uAPI header files Digital TV uAPI headers *********************** -.. kernel-include:: $BUILDDIR/frontend.h.rst +.. kernel-include:: include/uapi/linux/dvb/frontend.h + :generate-cross-refs: + :exception-file: frontend.h.rst.exceptions -.. kernel-include:: $BUILDDIR/dmx.h.rst +.. kernel-include:: include/uapi/linux/dvb/dmx.h + :generate-cross-refs: + :exception-file: dmx.h.rst.exceptions -.. kernel-include:: $BUILDDIR/ca.h.rst +.. kernel-include:: include/uapi/linux/dvb/ca.h + :generate-cross-refs: + :exception-file: ca.h.rst.exceptions + +.. kernel-include:: include/uapi/linux/dvb/net.h + :generate-cross-refs: + :exception-file: net.h.rst.exceptions -.. kernel-include:: $BUILDDIR/net.h.rst diff --git a/Documentation/userspace-api/media/net.h.rst.exceptions b/Documentation/userspace-api/media/dvb/net.h.rst.exceptions index 5159aa4bbbb9..5159aa4bbbb9 100644 --- a/Documentation/userspace-api/media/net.h.rst.exceptions +++ b/Documentation/userspace-api/media/dvb/net.h.rst.exceptions diff --git a/Documentation/userspace-api/media/mediactl/media-header.rst b/Documentation/userspace-api/media/mediactl/media-header.rst index c674271c93f5..d561d2845f3d 100644 --- a/Documentation/userspace-api/media/mediactl/media-header.rst +++ b/Documentation/userspace-api/media/mediactl/media-header.rst @@ -6,5 +6,6 @@ Media Controller Header File **************************** -.. kernel-include:: $BUILDDIR/media.h.rst - +.. 
kernel-include:: include/uapi/linux/media.h + :generate-cross-refs: + :exception-file: media.h.rst.exceptions diff --git a/Documentation/userspace-api/media/media.h.rst.exceptions b/Documentation/userspace-api/media/mediactl/media.h.rst.exceptions index 9b4c26502d95..9b4c26502d95 100644 --- a/Documentation/userspace-api/media/media.h.rst.exceptions +++ b/Documentation/userspace-api/media/mediactl/media.h.rst.exceptions diff --git a/Documentation/userspace-api/media/rc/lirc-header.rst b/Documentation/userspace-api/media/rc/lirc-header.rst index 54cb40b8a065..a53328327847 100644 --- a/Documentation/userspace-api/media/rc/lirc-header.rst +++ b/Documentation/userspace-api/media/rc/lirc-header.rst @@ -6,5 +6,7 @@ LIRC Header File **************** -.. kernel-include:: $BUILDDIR/lirc.h.rst +.. kernel-include:: include/uapi/linux/lirc.h + :generate-cross-refs: + :exception-file: lirc.h.rst.exceptions diff --git a/Documentation/userspace-api/media/lirc.h.rst.exceptions b/Documentation/userspace-api/media/rc/lirc.h.rst.exceptions index 1aeb7d7afe13..1aeb7d7afe13 100644 --- a/Documentation/userspace-api/media/lirc.h.rst.exceptions +++ b/Documentation/userspace-api/media/rc/lirc.h.rst.exceptions diff --git a/Documentation/userspace-api/media/v4l/videodev.rst b/Documentation/userspace-api/media/v4l/videodev.rst index c866fec417eb..cde485bc9a5f 100644 --- a/Documentation/userspace-api/media/v4l/videodev.rst +++ b/Documentation/userspace-api/media/v4l/videodev.rst @@ -6,4 +6,6 @@ Video For Linux Two Header File ******************************* -.. kernel-include:: $BUILDDIR/videodev2.h.rst +.. kernel-include:: include/uapi/linux/videodev2.h + :generate-cross-refs: + :exception-file: videodev2.h.rst.exceptions diff --git a/Documentation/userspace-api/media/videodev2.h.rst.exceptions b/Documentation/userspace-api/media/v4l/videodev2.h.rst.exceptions index 35d3456cc812..35d3456cc812 100644 --- a/Documentation/userspace-api/media/videodev2.h.rst.exceptions +++ b/Documentation/userspace-api/media/v4l/videodev2.h.rst.exceptions diff --git a/Documentation/virt/kvm/review-checklist.rst b/Documentation/virt/kvm/review-checklist.rst index debac54e14e7..053f00c50d66 100644 --- a/Documentation/virt/kvm/review-checklist.rst +++ b/Documentation/virt/kvm/review-checklist.rst @@ -98,7 +98,7 @@ New APIs It is important to demonstrate your use case. This can be as simple as explaining that the feature is already in use on bare metal, or it can be a proof-of-concept implementation in userspace. The latter need not be - open source, though that is of course preferrable for easier testing. + open source, though that is of course preferable for easier testing. Selftests should test corner cases of the APIs, and should also cover basic host and guest operation if no open source VMM uses the feature. diff --git a/Documentation/w1/masters/ds2482.rst b/Documentation/w1/masters/ds2482.rst index 17ebe8f660cd..5862024e4b15 100644 --- a/Documentation/w1/masters/ds2482.rst +++ b/Documentation/w1/masters/ds2482.rst @@ -22,7 +22,7 @@ Description ----------- The Maxim/Dallas Semiconductor DS2482 is a I2C device that provides -one (DS2482-100) or eight (DS2482-800) 1-wire busses. +one (DS2482-100) or eight (DS2482-800) 1-wire buses. General Remarks diff --git a/Documentation/w1/masters/index.rst b/Documentation/w1/masters/index.rst index cc40189909fd..871442c7f195 100644 --- a/Documentation/w1/masters/index.rst +++ b/Documentation/w1/masters/index.rst @@ -1,4 +1,4 @@ -. SPDX-License-Identifier: GPL-2.0 +.. 
SPDX-License-Identifier: GPL-2.0 ===================== 1-wire Master Drivers diff --git a/Documentation/w1/slaves/index.rst b/Documentation/w1/slaves/index.rst index d0697b202f09..a210f38c889c 100644 --- a/Documentation/w1/slaves/index.rst +++ b/Documentation/w1/slaves/index.rst @@ -1,4 +1,4 @@ -. SPDX-License-Identifier: GPL-2.0 +.. SPDX-License-Identifier: GPL-2.0 ==================== 1-wire Slave Drivers diff --git a/MAINTAINERS b/MAINTAINERS index fa3e51d84399..1e32d13783d3 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -347,6 +347,7 @@ L: linux-acpi@vger.kernel.org L: linux-riscv@lists.infradead.org S: Maintained F: drivers/acpi/riscv/ +F: include/linux/acpi_rimt.h ACPI PCC(Platform Communication Channel) MAILBOX DRIVER M: Sudeep Holla <sudeep.holla@arm.com> @@ -1176,6 +1177,15 @@ F: Documentation/networking/device_drivers/ethernet/amd/pds_core.rst F: drivers/net/ethernet/amd/pds_core/ F: include/linux/pds/ +AMD PENSANDO RDMA DRIVER +M: Abhijit Gangurde <abhijit.gangurde@amd.com> +M: Allen Hubbe <allen.hubbe@amd.com> +L: linux-rdma@vger.kernel.org +S: Maintained +F: Documentation/networking/device_drivers/ethernet/pensando/ionic_rdma.rst +F: drivers/infiniband/hw/ionic/ +F: include/uapi/rdma/ionic-abi.h + AMD PMC DRIVER M: Shyam Sundar S K <Shyam-sundar.S-k@amd.com> L: platform-driver-x86@vger.kernel.org @@ -7123,6 +7133,14 @@ S: Maintained F: Documentation/admin-guide/device-mapper/vdo*.rst F: drivers/md/dm-vdo/ +DEVICE-MAPPER PCACHE TARGET +M: Dongsheng Yang <dongsheng.yang@linux.dev> +M: Zheng Gu <cengku@gmail.com> +L: dm-devel@lists.linux.dev +S: Maintained +F: Documentation/admin-guide/device-mapper/dm-pcache.rst +F: drivers/md/dm-pcache/ + DEVLINK M: Jiri Pirko <jiri@resnulli.us> L: netdev@vger.kernel.org @@ -7384,11 +7402,13 @@ P: Documentation/doc-guide/maintainer-profile.rst T: git git://git.lwn.net/linux.git docs-next F: Documentation/ F: scripts/check-variable-fonts.sh +F: scripts/checktransupdate.py F: scripts/documentation-file-ref-check F: scripts/get_abi.py F: scripts/kernel-doc* F: scripts/lib/abi/* F: scripts/lib/kdoc/* +F: tools/docs/* F: tools/net/ynl/pyynl/lib/doc_generator.py F: scripts/sphinx-pre-install X: Documentation/ABI/ @@ -9115,7 +9135,6 @@ F: drivers/infiniband/hw/ocrdma/ F: include/uapi/rdma/ocrdma-abi.h EMULEX/BROADCOM EFCT FC/FCOE SCSI TARGET DRIVER -M: James Smart <james.smart@broadcom.com> M: Ram Vegesna <ram.vegesna@broadcom.com> L: linux-scsi@vger.kernel.org L: target-devel@vger.kernel.org @@ -9124,8 +9143,8 @@ W: http://www.broadcom.com F: drivers/scsi/elx/ EMULEX/BROADCOM LPFC FC/FCOE SCSI DRIVER -M: James Smart <james.smart@broadcom.com> -M: Dick Kennedy <dick.kennedy@broadcom.com> +M: Justin Tee <justin.tee@broadcom.com> +M: Paul Ely <paul.ely@broadcom.com> L: linux-scsi@vger.kernel.org S: Supported W: http://www.broadcom.com @@ -14323,6 +14342,7 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/rcu/linux.git rcu/dev F: Documentation/atomic_bitops.txt F: Documentation/atomic_t.txt F: Documentation/core-api/refcount-vs-atomic.rst +F: Documentation/dev-tools/lkmm/ F: Documentation/litmus-tests/ F: Documentation/memory-barriers.txt F: tools/memory-model/ @@ -18384,7 +18404,9 @@ F: drivers/nvme/target/fabrics-cmd-auth.c F: include/linux/nvme-auth.h NVM EXPRESS FC TRANSPORT DRIVERS -M: James Smart <james.smart@broadcom.com> +M: Justin Tee <justin.tee@broadcom.com> +M: Naresh Gottumukkala <nareshgottumukkala83@gmail.com> +M: Paul Ely <paul.ely@broadcom.com> L: linux-nvme@lists.infradead.org S: Supported F: drivers/nvme/host/fc.c @@ -1797,8 
+1797,9 @@ $(help-board-dirs): help-%: # Documentation targets # --------------------------------------------------------------------------- -DOC_TARGETS := xmldocs latexdocs pdfdocs htmldocs epubdocs cleandocs \ - linkcheckdocs dochelp refcheckdocs texinfodocs infodocs +DOC_TARGETS := xmldocs latexdocs pdfdocs htmldocs htmldocs-redirects \ + epubdocs cleandocs linkcheckdocs dochelp refcheckdocs \ + texinfodocs infodocs PHONY += $(DOC_TARGETS) $(DOC_TARGETS): $(Q)$(MAKE) $(build)=Documentation $@ diff --git a/arch/powerpc/kernel/dma-iommu.c b/arch/powerpc/kernel/dma-iommu.c index 4d64a5db50f3..0359ab72cd3b 100644 --- a/arch/powerpc/kernel/dma-iommu.c +++ b/arch/powerpc/kernel/dma-iommu.c @@ -14,7 +14,7 @@ #define can_map_direct(dev, addr) \ ((dev)->bus_dma_limit >= phys_to_dma((dev), (addr))) -bool arch_dma_map_page_direct(struct device *dev, phys_addr_t addr) +bool arch_dma_map_phys_direct(struct device *dev, phys_addr_t addr) { if (likely(!dev->bus_dma_limit)) return false; @@ -24,7 +24,7 @@ bool arch_dma_map_page_direct(struct device *dev, phys_addr_t addr) #define is_direct_handle(dev, h) ((h) >= (dev)->archdata.dma_offset) -bool arch_dma_unmap_page_direct(struct device *dev, dma_addr_t dma_handle) +bool arch_dma_unmap_phys_direct(struct device *dev, dma_addr_t dma_handle) { if (likely(!dev->bus_dma_limit)) return false; diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 6eb9db8017b8..0c6038dc5dfd 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -16,6 +16,7 @@ config RISCV select ACPI_MCFG if (ACPI && PCI) select ACPI_PPTT if ACPI select ACPI_REDUCED_HARDWARE_ONLY if ACPI + select ACPI_RIMT if ACPI select ACPI_SPCR_TABLE if ACPI select ARCH_DMA_DEFAULT_COHERENT select ARCH_ENABLE_HUGEPAGE_MIGRATION if HUGETLB_PAGE && MIGRATION diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig index b594780a57d7..2cdbd08b30e4 100644 --- a/drivers/acpi/Kconfig +++ b/drivers/acpi/Kconfig @@ -547,6 +547,10 @@ if ARM64 source "drivers/acpi/arm64/Kconfig" endif +if RISCV +source "drivers/acpi/riscv/Kconfig" +endif + config ACPI_PPTT bool diff --git a/drivers/acpi/riscv/Kconfig b/drivers/acpi/riscv/Kconfig new file mode 100644 index 000000000000..046296a18d00 --- /dev/null +++ b/drivers/acpi/riscv/Kconfig @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# ACPI Configuration for RISC-V +# + +config ACPI_RIMT + bool diff --git a/drivers/acpi/riscv/Makefile b/drivers/acpi/riscv/Makefile index a96fdf1e2cb8..1284a076fa88 100644 --- a/drivers/acpi/riscv/Makefile +++ b/drivers/acpi/riscv/Makefile @@ -2,3 +2,4 @@ obj-y += rhct.o init.o irq.o obj-$(CONFIG_ACPI_PROCESSOR_IDLE) += cpuidle.o obj-$(CONFIG_ACPI_CPPC_LIB) += cppc.o +obj-$(CONFIG_ACPI_RIMT) += rimt.o diff --git a/drivers/acpi/riscv/init.c b/drivers/acpi/riscv/init.c index 673e4d5dd752..7c00f7995e86 100644 --- a/drivers/acpi/riscv/init.c +++ b/drivers/acpi/riscv/init.c @@ -10,4 +10,6 @@ void __init acpi_arch_init(void) { riscv_acpi_init_gsi_mapping(); + if (IS_ENABLED(CONFIG_ACPI_RIMT)) + riscv_acpi_rimt_init(); } diff --git a/drivers/acpi/riscv/init.h b/drivers/acpi/riscv/init.h index 0b9a07e4031f..1680aa2aaf23 100644 --- a/drivers/acpi/riscv/init.h +++ b/drivers/acpi/riscv/init.h @@ -2,3 +2,4 @@ #include <linux/init.h> void __init riscv_acpi_init_gsi_mapping(void); +void __init riscv_acpi_rimt_init(void); diff --git a/drivers/acpi/riscv/rimt.c b/drivers/acpi/riscv/rimt.c new file mode 100644 index 000000000000..683fcfe35c31 --- /dev/null +++ b/drivers/acpi/riscv/rimt.c @@ -0,0 +1,520 @@ +// SPDX-License-Identifier: 
GPL-2.0-only +/* + * Copyright (C) 2024-2025, Ventana Micro Systems Inc + * Author: Sunil V L <sunilvl@ventanamicro.com> + * + */ + +#define pr_fmt(fmt) "ACPI: RIMT: " fmt + +#include <linux/acpi.h> +#include <linux/acpi_rimt.h> +#include <linux/iommu.h> +#include <linux/list.h> +#include <linux/pci.h> +#include <linux/platform_device.h> +#include "init.h" + +struct rimt_fwnode { + struct list_head list; + struct acpi_rimt_node *rimt_node; + struct fwnode_handle *fwnode; +}; + +static LIST_HEAD(rimt_fwnode_list); +static DEFINE_SPINLOCK(rimt_fwnode_lock); + +#define RIMT_TYPE_MASK(type) (1 << (type)) +#define RIMT_IOMMU_TYPE BIT(0) + +/* Root pointer to the mapped RIMT table */ +static struct acpi_table_header *rimt_table; + +/** + * rimt_set_fwnode() - Create rimt_fwnode and use it to register + * iommu data in the rimt_fwnode_list + * + * @rimt_node: RIMT table node associated with the IOMMU + * @fwnode: fwnode associated with the RIMT node + * + * Returns: 0 on success + * <0 on failure + */ +static int rimt_set_fwnode(struct acpi_rimt_node *rimt_node, + struct fwnode_handle *fwnode) +{ + struct rimt_fwnode *np; + + np = kzalloc(sizeof(*np), GFP_ATOMIC); + + if (WARN_ON(!np)) + return -ENOMEM; + + INIT_LIST_HEAD(&np->list); + np->rimt_node = rimt_node; + np->fwnode = fwnode; + + spin_lock(&rimt_fwnode_lock); + list_add_tail(&np->list, &rimt_fwnode_list); + spin_unlock(&rimt_fwnode_lock); + + return 0; +} + +/** + * rimt_get_fwnode() - Retrieve fwnode associated with an RIMT node + * + * @node: RIMT table node to be looked-up + * + * Returns: fwnode_handle pointer on success, NULL on failure + */ +static struct fwnode_handle *rimt_get_fwnode(struct acpi_rimt_node *node) +{ + struct fwnode_handle *fwnode = NULL; + struct rimt_fwnode *curr; + + spin_lock(&rimt_fwnode_lock); + list_for_each_entry(curr, &rimt_fwnode_list, list) { + if (curr->rimt_node == node) { + fwnode = curr->fwnode; + break; + } + } + spin_unlock(&rimt_fwnode_lock); + + return fwnode; +} + +static acpi_status rimt_match_node_callback(struct acpi_rimt_node *node, + void *context) +{ + acpi_status status = AE_NOT_FOUND; + struct device *dev = context; + + if (node->type == ACPI_RIMT_NODE_TYPE_IOMMU) { + struct acpi_rimt_iommu *iommu_node = (struct acpi_rimt_iommu *)&node->node_data; + + if (dev_is_pci(dev)) { + struct pci_dev *pdev; + u16 bdf; + + pdev = to_pci_dev(dev); + bdf = PCI_DEVID(pdev->bus->number, pdev->devfn); + if ((pci_domain_nr(pdev->bus) == iommu_node->pcie_segment_number) && + bdf == iommu_node->pcie_bdf) { + status = AE_OK; + } else { + status = AE_NOT_FOUND; + } + } else { + struct platform_device *pdev = to_platform_device(dev); + struct resource *res; + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (res && res->start == iommu_node->base_address) + status = AE_OK; + else + status = AE_NOT_FOUND; + } + } else if (node->type == ACPI_RIMT_NODE_TYPE_PCIE_ROOT_COMPLEX) { + struct acpi_rimt_pcie_rc *pci_rc; + struct pci_bus *bus; + + bus = to_pci_bus(dev); + pci_rc = (struct acpi_rimt_pcie_rc *)node->node_data; + + /* + * It is assumed that PCI segment numbers map one-to-one + * with root complexes. Each segment number can represent only + * one root complex. + */ + status = pci_rc->pcie_segment_number == pci_domain_nr(bus) ?
+ AE_OK : AE_NOT_FOUND; + } else if (node->type == ACPI_RIMT_NODE_TYPE_PLAT_DEVICE) { + struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER, NULL }; + struct acpi_rimt_platform_device *ncomp; + struct device *plat_dev = dev; + struct acpi_device *adev; + + /* + * Walk the device tree to find a device with an + * ACPI companion; there is no point in scanning + * RIMT for a device matching a platform device if + * the device does not have an ACPI companion to + * start with. + */ + do { + adev = ACPI_COMPANION(plat_dev); + if (adev) + break; + + plat_dev = plat_dev->parent; + } while (plat_dev); + + if (!adev) + return status; + + status = acpi_get_name(adev->handle, ACPI_FULL_PATHNAME, &buf); + if (ACPI_FAILURE(status)) { + dev_warn(plat_dev, "Can't get device full path name\n"); + return status; + } + + ncomp = (struct acpi_rimt_platform_device *)node->node_data; + status = !strcmp(ncomp->device_name, buf.pointer) ? + AE_OK : AE_NOT_FOUND; + acpi_os_free(buf.pointer); + } + + return status; +} + +static struct acpi_rimt_node *rimt_scan_node(enum acpi_rimt_node_type type, + void *context) +{ + struct acpi_rimt_node *rimt_node, *rimt_end; + struct acpi_table_rimt *rimt; + int i; + + if (!rimt_table) + return NULL; + + /* Get the first RIMT node */ + rimt = (struct acpi_table_rimt *)rimt_table; + rimt_node = ACPI_ADD_PTR(struct acpi_rimt_node, rimt, + rimt->node_offset); + rimt_end = ACPI_ADD_PTR(struct acpi_rimt_node, rimt_table, + rimt_table->length); + + for (i = 0; i < rimt->num_nodes; i++) { + if (WARN_TAINT(rimt_node >= rimt_end, TAINT_FIRMWARE_WORKAROUND, + "RIMT node pointer overflows, bad table!\n")) + return NULL; + + if (rimt_node->type == type && + ACPI_SUCCESS(rimt_match_node_callback(rimt_node, context))) + return rimt_node; + + rimt_node = ACPI_ADD_PTR(struct acpi_rimt_node, rimt_node, + rimt_node->length); + } + + return NULL; +} + +static bool rimt_pcie_rc_supports_ats(struct acpi_rimt_node *node) +{ + struct acpi_rimt_pcie_rc *pci_rc; + + pci_rc = (struct acpi_rimt_pcie_rc *)node->node_data; + return pci_rc->flags & ACPI_RIMT_PCIE_ATS_SUPPORTED; +} + +static int rimt_iommu_xlate(struct device *dev, struct acpi_rimt_node *node, u32 deviceid) +{ + struct fwnode_handle *rimt_fwnode; + + if (!node) + return -ENODEV; + + rimt_fwnode = rimt_get_fwnode(node); + + /* + * The IOMMU drivers may not be probed yet. 
+ * Defer the IOMMU configuration + */ + if (!rimt_fwnode) + return -EPROBE_DEFER; + + return acpi_iommu_fwspec_init(dev, deviceid, rimt_fwnode); +} + +struct rimt_pci_alias_info { + struct device *dev; + struct acpi_rimt_node *node; + const struct iommu_ops *ops; +}; + +static int rimt_id_map(struct acpi_rimt_id_mapping *map, u8 type, u32 rid_in, u32 *rid_out) +{ + if (rid_in < map->source_id_base || + (rid_in > map->source_id_base + map->num_ids)) + return -ENXIO; + + *rid_out = map->dest_id_base + (rid_in - map->source_id_base); + return 0; +} + +static struct acpi_rimt_node *rimt_node_get_id(struct acpi_rimt_node *node, + u32 *id_out, int index) +{ + struct acpi_rimt_platform_device *plat_node; + u32 id_mapping_offset, num_id_mapping; + struct acpi_rimt_pcie_rc *pci_node; + struct acpi_rimt_id_mapping *map; + struct acpi_rimt_node *parent; + + if (node->type == ACPI_RIMT_NODE_TYPE_PCIE_ROOT_COMPLEX) { + pci_node = (struct acpi_rimt_pcie_rc *)&node->node_data; + id_mapping_offset = pci_node->id_mapping_offset; + num_id_mapping = pci_node->num_id_mappings; + } else if (node->type == ACPI_RIMT_NODE_TYPE_PLAT_DEVICE) { + plat_node = (struct acpi_rimt_platform_device *)&node->node_data; + id_mapping_offset = plat_node->id_mapping_offset; + num_id_mapping = plat_node->num_id_mappings; + } else { + return NULL; + } + + if (!id_mapping_offset || !num_id_mapping || index >= num_id_mapping) + return NULL; + + map = ACPI_ADD_PTR(struct acpi_rimt_id_mapping, node, + id_mapping_offset + index * sizeof(*map)); + + /* Firmware bug! */ + if (!map->dest_offset) { + pr_err(FW_BUG "[node %p type %d] ID map has NULL parent reference\n", + node, node->type); + return NULL; + } + + parent = ACPI_ADD_PTR(struct acpi_rimt_node, rimt_table, map->dest_offset); + + if (node->type == ACPI_RIMT_NODE_TYPE_PLAT_DEVICE || + node->type == ACPI_RIMT_NODE_TYPE_PCIE_ROOT_COMPLEX) { + *id_out = map->dest_id_base; + return parent; + } + + return NULL; +} + +/* + * RISC-V supports IOMMU as a PCI device or a platform device. + * When it is a platform device, there should be a namespace device as + * well along with RIMT. To create the link between RIMT information and + * the platform device, the IOMMU driver should register itself with the + * RIMT module. This is true for PCI based IOMMU as well. 
+ */ +int rimt_iommu_register(struct device *dev) +{ + struct fwnode_handle *rimt_fwnode; + struct acpi_rimt_node *node; + + node = rimt_scan_node(ACPI_RIMT_NODE_TYPE_IOMMU, dev); + if (!node) { + pr_err("Could not find IOMMU node in RIMT\n"); + return -ENODEV; + } + + if (dev_is_pci(dev)) { + rimt_fwnode = acpi_alloc_fwnode_static(); + if (!rimt_fwnode) + return -ENOMEM; + + rimt_fwnode->dev = dev; + if (!dev->fwnode) + dev->fwnode = rimt_fwnode; + + rimt_set_fwnode(node, rimt_fwnode); + } else { + rimt_set_fwnode(node, dev->fwnode); + } + + return 0; +} + +#ifdef CONFIG_IOMMU_API + +static struct acpi_rimt_node *rimt_node_map_id(struct acpi_rimt_node *node, + u32 id_in, u32 *id_out, + u8 type_mask) +{ + struct acpi_rimt_platform_device *plat_node; + u32 id_mapping_offset, num_id_mapping; + struct acpi_rimt_pcie_rc *pci_node; + u32 id = id_in; + + /* Parse the ID mapping tree to find specified node type */ + while (node) { + struct acpi_rimt_id_mapping *map; + int i, rc = 0; + u32 map_id = id; + + if (RIMT_TYPE_MASK(node->type) & type_mask) { + if (id_out) + *id_out = id; + return node; + } + + if (node->type == ACPI_RIMT_NODE_TYPE_PCIE_ROOT_COMPLEX) { + pci_node = (struct acpi_rimt_pcie_rc *)&node->node_data; + id_mapping_offset = pci_node->id_mapping_offset; + num_id_mapping = pci_node->num_id_mappings; + } else if (node->type == ACPI_RIMT_NODE_TYPE_PLAT_DEVICE) { + plat_node = (struct acpi_rimt_platform_device *)&node->node_data; + id_mapping_offset = plat_node->id_mapping_offset; + num_id_mapping = plat_node->num_id_mappings; + } else { + goto fail_map; + } + + if (!id_mapping_offset || !num_id_mapping) + goto fail_map; + + map = ACPI_ADD_PTR(struct acpi_rimt_id_mapping, node, + id_mapping_offset); + + /* Firmware bug! */ + if (!map->dest_offset) { + pr_err(FW_BUG "[node %p type %d] ID map has NULL parent reference\n", + node, node->type); + goto fail_map; + } + + /* Do the ID translation */ + for (i = 0; i < num_id_mapping; i++, map++) { + rc = rimt_id_map(map, node->type, map_id, &id); + if (!rc) + break; + } + + if (i == num_id_mapping) + goto fail_map; + + node = ACPI_ADD_PTR(struct acpi_rimt_node, rimt_table, + rc ? 
0 : map->dest_offset); + } + +fail_map: + /* Map input ID to output ID unchanged on mapping failure */ + if (id_out) + *id_out = id_in; + + return NULL; +} + +static struct acpi_rimt_node *rimt_node_map_platform_id(struct acpi_rimt_node *node, u32 *id_out, + u8 type_mask, int index) +{ + struct acpi_rimt_node *parent; + u32 id; + + parent = rimt_node_get_id(node, &id, index); + if (!parent) + return NULL; + + if (!(RIMT_TYPE_MASK(parent->type) & type_mask)) + parent = rimt_node_map_id(parent, id, id_out, type_mask); + else + if (id_out) + *id_out = id; + + return parent; +} + +static int rimt_pci_iommu_init(struct pci_dev *pdev, u16 alias, void *data) +{ + struct rimt_pci_alias_info *info = data; + struct acpi_rimt_node *parent; + u32 deviceid; + + parent = rimt_node_map_id(info->node, alias, &deviceid, RIMT_IOMMU_TYPE); + return rimt_iommu_xlate(info->dev, parent, deviceid); +} + +static int rimt_plat_iommu_map(struct device *dev, struct acpi_rimt_node *node) +{ + struct acpi_rimt_node *parent; + int err = -ENODEV, i = 0; + u32 deviceid = 0; + + do { + parent = rimt_node_map_platform_id(node, &deviceid, + RIMT_IOMMU_TYPE, + i++); + + if (parent) + err = rimt_iommu_xlate(dev, parent, deviceid); + } while (parent && !err); + + return err; +} + +static int rimt_plat_iommu_map_id(struct device *dev, + struct acpi_rimt_node *node, + const u32 *in_id) +{ + struct acpi_rimt_node *parent; + u32 deviceid; + + parent = rimt_node_map_id(node, *in_id, &deviceid, RIMT_IOMMU_TYPE); + if (parent) + return rimt_iommu_xlate(dev, parent, deviceid); + + return -ENODEV; +} + +/** + * rimt_iommu_configure_id - Set-up IOMMU configuration for a device. + * + * @dev: device to configure + * @id_in: optional input id const value pointer + * + * Returns: 0 on success, <0 on failure + */ +int rimt_iommu_configure_id(struct device *dev, const u32 *id_in) +{ + struct acpi_rimt_node *node; + int err = -ENODEV; + + if (dev_is_pci(dev)) { + struct iommu_fwspec *fwspec; + struct pci_bus *bus = to_pci_dev(dev)->bus; + struct rimt_pci_alias_info info = { .dev = dev }; + + node = rimt_scan_node(ACPI_RIMT_NODE_TYPE_PCIE_ROOT_COMPLEX, &bus->dev); + if (!node) + return -ENODEV; + + info.node = node; + err = pci_for_each_dma_alias(to_pci_dev(dev), + rimt_pci_iommu_init, &info); + + fwspec = dev_iommu_fwspec_get(dev); + if (fwspec && rimt_pcie_rc_supports_ats(node)) + fwspec->flags |= IOMMU_FWSPEC_PCI_RC_ATS; + } else { + node = rimt_scan_node(ACPI_RIMT_NODE_TYPE_PLAT_DEVICE, dev); + if (!node) + return -ENODEV; + + err = id_in ? rimt_plat_iommu_map_id(dev, node, id_in) : + rimt_plat_iommu_map(dev, node); + } + + return err; +} + +#endif + +void __init riscv_acpi_rimt_init(void) +{ + acpi_status status; + + /* rimt_table will be used at runtime after the rimt init, + * so we don't need to call acpi_put_table() to release + * the RIMT table mapping. 
+ */ + status = acpi_get_table(ACPI_SIG_RIMT, 0, &rimt_table); + if (ACPI_FAILURE(status)) { + if (status != AE_NOT_FOUND) { + const char *msg = acpi_format_exception(status); + + pr_err("Failed to get table, %s\n", msg); + } + + return; + } +} diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index 880a544d73cd..065abe56f440 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -11,6 +11,7 @@ #include <linux/kernel.h> #include <linux/acpi.h> #include <linux/acpi_iort.h> +#include <linux/acpi_rimt.h> #include <linux/acpi_viot.h> #include <linux/iommu.h> #include <linux/signal.h> @@ -1631,7 +1632,10 @@ static int acpi_iommu_configure_id(struct device *dev, const u32 *id_in) err = iort_iommu_configure_id(dev, id_in); if (err && err != -EPROBE_DEFER) + err = rimt_iommu_configure_id(dev, id_in); + if (err && err != -EPROBE_DEFER) err = viot_iommu_configure(dev); + mutex_unlock(&iommu_probe_device_lock); return err; diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c index 65d6d0af140a..8dff5c2c40fd 100644 --- a/drivers/crypto/ccp/sev-dev.c +++ b/drivers/crypto/ccp/sev-dev.c @@ -28,6 +28,7 @@ #include <linux/fs_struct.h> #include <linux/psp.h> #include <linux/amd-iommu.h> +#include <linux/crash_dump.h> #include <asm/smp.h> #include <asm/cacheflush.h> @@ -1526,6 +1527,15 @@ static int _sev_platform_init_locked(struct sev_platform_init_args *args) if (!psp_master || !psp_master->sev_data) return -ENODEV; + /* + * Skip SNP/SEV initialization under a kdump kernel as SEV/SNP + * may already be initialized in the previous kernel. Since no + * SNP/SEV guests are run under a kdump kernel, there is no + * need to initialize SNP or SEV during kdump boot. + */ + if (is_kdump_kernel()) + return 0; + sev = psp_master->sev_data; if (sev->state == SEV_STATE_INIT) diff --git a/drivers/fwctl/mlx5/main.c b/drivers/fwctl/mlx5/main.c index f93aa0cecdb9..3dacccf7855c 100644 --- a/drivers/fwctl/mlx5/main.c +++ b/drivers/fwctl/mlx5/main.c @@ -58,6 +58,9 @@ enum { MLX5_CMD_OP_QUERY_DC_CNAK_TRACE = 0x716, MLX5_CMD_OP_QUERY_NVMF_BACKEND_CONTROLLER = 0x722, MLX5_CMD_OP_QUERY_NVMF_NAMESPACE_CONTEXT = 0x728, + MLX5_CMD_OP_QUERY_ADJACENT_FUNCTIONS_ID = 0x730, + MLX5_CMD_OP_DELEGATE_VHCA_MANAGEMENT = 0x731, + MLX5_CMD_OP_QUERY_DELEGATED_VHCA = 0x732, MLX5_CMD_OP_QUERY_BURST_SIZE = 0x813, MLX5_CMD_OP_QUERY_DIAGNOSTIC_PARAMS = 0x819, MLX5_CMD_OP_SET_DIAGNOSTIC_PARAMS = 0x820, @@ -188,6 +191,7 @@ static bool mlx5ctl_validate_rpc(const void *in, enum fwctl_rpc_scope scope) * filter commands manually for now. 
*/ switch (opcode) { + case MLX5_CMD_OP_MODIFY_CONG_STATUS: case MLX5_CMD_OP_POSTPONE_CONNECTED_QP_TIMEOUT: case MLX5_CMD_OP_QUERY_ADAPTER: case MLX5_CMD_OP_QUERY_ESW_FUNCTIONS: @@ -196,6 +200,7 @@ static bool mlx5ctl_validate_rpc(const void *in, enum fwctl_rpc_scope scope) case MLX5_CMD_OP_QUERY_OTHER_HCA_CAP: case MLX5_CMD_OP_QUERY_ROCE_ADDRESS: case MLX5_CMD_OPCODE_QUERY_VUID: + case MLX5_CMD_OP_DELEGATE_VHCA_MANAGEMENT: /* * FW limits SET_HCA_CAP on the tools UID to only the other function * mode which is used for function pre-configuration @@ -281,6 +286,8 @@ static bool mlx5ctl_validate_rpc(const void *in, enum fwctl_rpc_scope scope) case MLX5_CMD_OP_QUERY_XRQ: case MLX5_CMD_OP_USER_QUERY_XRQ_DC_PARAMS_ENTRY: case MLX5_CMD_OP_USER_QUERY_XRQ_ERROR_PARAMS: + case MLX5_CMD_OP_QUERY_ADJACENT_FUNCTIONS_ID: + case MLX5_CMD_OP_QUERY_DELEGATED_VHCA: return scope >= FWCTL_RPC_DEBUG_READ_ONLY; case MLX5_CMD_OP_SET_DIAGNOSTIC_PARAMS: @@ -345,7 +352,7 @@ static void *mlx5ctl_fw_rpc(struct fwctl_uctx *uctx, enum fwctl_rpc_scope scope, */ if (ret && ret != -EREMOTEIO) { if (rpc_out != rpc_in) - kfree(rpc_out); + kvfree(rpc_out); return ERR_PTR(ret); } return rpc_out; diff --git a/drivers/fwctl/pds/main.c b/drivers/fwctl/pds/main.c index 9b9d1f6b5556..1809853f6353 100644 --- a/drivers/fwctl/pds/main.c +++ b/drivers/fwctl/pds/main.c @@ -6,6 +6,7 @@ #include <linux/pci.h> #include <linux/vmalloc.h> #include <linux/bitfield.h> +#include <linux/string.h> #include <uapi/fwctl/fwctl.h> #include <uapi/fwctl/pds.h> @@ -366,18 +367,10 @@ static void *pdsfc_fw_rpc(struct fwctl_uctx *uctx, enum fwctl_rpc_scope scope, return ERR_PTR(err); if (rpc->in.len > 0) { - in_payload = kzalloc(rpc->in.len, GFP_KERNEL); - if (!in_payload) { - dev_err(dev, "Failed to allocate in_payload\n"); - err = -ENOMEM; - goto err_out; - } - - if (copy_from_user(in_payload, u64_to_user_ptr(rpc->in.payload), - rpc->in.len)) { + in_payload = memdup_user(u64_to_user_ptr(rpc->in.payload), rpc->in.len); + if (IS_ERR(in_payload)) { dev_dbg(dev, "Failed to copy in_payload from user\n"); - err = -EFAULT; - goto err_in_payload; + return in_payload; } in_payload_dma_addr = dma_map_single(dev->parent, in_payload, @@ -453,7 +446,6 @@ err_out_payload: rpc->in.len, DMA_TO_DEVICE); err_in_payload: kfree(in_payload); -err_out: if (err) return ERR_PTR(err); @@ -481,7 +473,7 @@ static int pdsfc_probe(struct auxiliary_device *adev, pdsfc = fwctl_alloc_device(&padev->vf_pdev->dev, &pdsfc_ops, struct pdsfc_dev, fwctl); if (!pdsfc) - return dev_err_probe(dev, -ENOMEM, "Failed to allocate fwctl device struct\n"); + return -ENOMEM; pdsfc->padev = padev; err = pdsfc_identify(pdsfc); diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig index 3a394cd772f6..f0323f1d6f01 100644 --- a/drivers/infiniband/Kconfig +++ b/drivers/infiniband/Kconfig @@ -85,6 +85,7 @@ source "drivers/infiniband/hw/efa/Kconfig" source "drivers/infiniband/hw/erdma/Kconfig" source "drivers/infiniband/hw/hfi1/Kconfig" source "drivers/infiniband/hw/hns/Kconfig" +source "drivers/infiniband/hw/ionic/Kconfig" source "drivers/infiniband/hw/irdma/Kconfig" source "drivers/infiniband/hw/mana/Kconfig" source "drivers/infiniband/hw/mlx4/Kconfig" diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index be0743dac3ff..61596cda2b65 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -446,63 +446,41 @@ static int addr6_resolve(struct sockaddr *src_sock, } #endif +static bool is_dst_local(const struct dst_entry *dst) +{ + if 
(dst->ops->family == AF_INET)
+		return !!(dst_rtable(dst)->rt_type & RTN_LOCAL);
+	else if (dst->ops->family == AF_INET6)
+		return !!(dst_rt6_info(dst)->rt6i_flags & RTF_LOCAL);
+	else
+		return false;
+}
+
 static int addr_resolve_neigh(const struct dst_entry *dst,
 			      const struct sockaddr *dst_in,
 			      struct rdma_dev_addr *addr,
-			      unsigned int ndev_flags,
 			      u32 seq)
 {
-	int ret = 0;
-
-	if (ndev_flags & IFF_LOOPBACK) {
+	if (is_dst_local(dst)) {
+		/* When the destination is a local entry, source and
+		 * destination are the same. Skip the neighbour lookup.
+		 */
 		memcpy(addr->dst_dev_addr, addr->src_dev_addr, MAX_ADDR_LEN);
-	} else {
-		if (!(ndev_flags & IFF_NOARP)) {
-			/* If the device doesn't do ARP internally */
-			ret = fetch_ha(dst, addr, dst_in, seq);
-		}
+		return 0;
 	}
-	return ret;
-}
-
-static int copy_src_l2_addr(struct rdma_dev_addr *dev_addr,
-			    const struct sockaddr *dst_in,
-			    const struct dst_entry *dst,
-			    const struct net_device *ndev)
-{
-	int ret = 0;
-
-	if (dst->dev->flags & IFF_LOOPBACK)
-		ret = rdma_translate_ip(dst_in, dev_addr);
-	else
-		rdma_copy_src_l2_addr(dev_addr, dst->dev);
-
-	/*
-	 * If there's a gateway and type of device not ARPHRD_INFINIBAND,
-	 * we're definitely in RoCE v2 (as RoCE v1 isn't routable) set the
-	 * network type accordingly.
-	 */
-	if (has_gateway(dst, dst_in->sa_family) &&
-	    ndev->type != ARPHRD_INFINIBAND)
-		dev_addr->network = dst_in->sa_family == AF_INET ?
-				    RDMA_NETWORK_IPV4 :
-				    RDMA_NETWORK_IPV6;
-	else
-		dev_addr->network = RDMA_NETWORK_IB;
-	return ret;
+	return fetch_ha(dst, addr, dst_in, seq);
 }
 
 static int rdma_set_src_addr_rcu(struct rdma_dev_addr *dev_addr,
-				 unsigned int *ndev_flags,
 				 const struct sockaddr *dst_in,
 				 const struct dst_entry *dst)
 {
 	struct net_device *ndev = READ_ONCE(dst->dev);
 
-	*ndev_flags = ndev->flags;
 	/* A physical device must be the RDMA device to use */
-	if (ndev->flags & IFF_LOOPBACK) {
+	if (is_dst_local(dst)) {
+		int ret;
 		/*
 		 * RDMA (IB/RoCE, iWarp) doesn't run on lo interface or
 		 * loopback IP address. So if route is resolved to loopback
@@ -512,9 +490,27 @@ static int rdma_set_src_addr_rcu(struct rdma_dev_addr *dev_addr,
 		ndev = rdma_find_ndev_for_src_ip_rcu(dev_net(ndev), dst_in);
 		if (IS_ERR(ndev))
 			return -ENODEV;
+		ret = rdma_translate_ip(dst_in, dev_addr);
+		if (ret)
+			return ret;
+	} else {
+		rdma_copy_src_l2_addr(dev_addr, dst->dev);
 	}
-	return copy_src_l2_addr(dev_addr, dst_in, dst, ndev);
+	/*
+	 * If there's a gateway and type of device not ARPHRD_INFINIBAND,
+	 * we're definitely in RoCE v2 (as RoCE v1 isn't routable); set the
+	 * network type accordingly.
+	 */
+	if (has_gateway(dst, dst_in->sa_family) &&
+	    ndev->type != ARPHRD_INFINIBAND)
+		dev_addr->network = dst_in->sa_family == AF_INET ?
+				    RDMA_NETWORK_IPV4 :
+				    RDMA_NETWORK_IPV6;
+	else
+		dev_addr->network = RDMA_NETWORK_IB;
+
+	return 0;
 }
 
 static int set_addr_netns_by_gid_rcu(struct rdma_dev_addr *addr)
@@ -551,7 +547,6 @@ static int addr_resolve(struct sockaddr *src_in,
 			u32 seq)
 {
 	struct dst_entry *dst = NULL;
-	unsigned int ndev_flags = 0;
 	struct rtable *rt = NULL;
 	int ret;
 
@@ -588,7 +583,7 @@ static int addr_resolve(struct sockaddr *src_in,
 		rcu_read_unlock();
 		goto done;
 	}
-	ret = rdma_set_src_addr_rcu(addr, &ndev_flags, dst_in, dst);
+	ret = rdma_set_src_addr_rcu(addr, dst_in, dst);
 	rcu_read_unlock();
 
 	/*
@@ -596,7 +591,7 @@ static int addr_resolve(struct sockaddr *src_in,
 	 * only if src addr translation didn't fail.
*/ if (!ret && resolve_neigh) - ret = addr_resolve_neigh(dst, dst_in, addr, ndev_flags, seq); + ret = addr_resolve_neigh(dst, dst_in, addr, seq); if (src_in->sa_family == AF_INET) ip_rt_put(rt); diff --git a/drivers/infiniband/core/agent.c b/drivers/infiniband/core/agent.c index 3bb46696731e..25a060a28301 100644 --- a/drivers/infiniband/core/agent.c +++ b/drivers/infiniband/core/agent.c @@ -110,8 +110,7 @@ void agent_send_response(const struct ib_mad_hdr *mad_hdr, const struct ib_grh * agent = port_priv->agent[qpn]; ah = ib_create_ah_from_wc(agent->qp->pd, wc, grh, port_num); if (IS_ERR(ah)) { - dev_err(&device->dev, "ib_create_ah_from_wc error %ld\n", - PTR_ERR(ah)); + dev_err(&device->dev, "ib_create_ah_from_wc error %pe\n", ah); return; } diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 92678e438ff4..01bede8ba105 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -1049,8 +1049,8 @@ static noinline void cm_destroy_id_wait_timeout(struct ib_cm_id *cm_id, struct cm_id_private *cm_id_priv; cm_id_priv = container_of(cm_id, struct cm_id_private, id); - pr_err("%s: cm_id=%p timed out. state %d -> %d, refcnt=%d\n", __func__, - cm_id, old_state, cm_id->state, refcount_read(&cm_id_priv->refcount)); + pr_err_ratelimited("%s: cm_id=%p timed out. state %d -> %d, refcnt=%d\n", __func__, + cm_id, old_state, cm_id->state, refcount_read(&cm_id_priv->refcount)); } static void cm_destroy_id(struct ib_cm_id *cm_id, int err) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 9b471548e7ae..5b2d3ae3f9fc 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -2076,6 +2076,7 @@ static void _destroy_id(struct rdma_id_private *id_priv, kfree(id_priv->id.route.path_rec); kfree(id_priv->id.route.path_rec_inbound); kfree(id_priv->id.route.path_rec_outbound); + kfree(id_priv->id.route.service_recs); put_net(id_priv->id.route.addr.dev_addr.net); kfree(id_priv); @@ -3382,13 +3383,18 @@ err1: int rdma_resolve_route(struct rdma_cm_id *id, unsigned long timeout_ms) { struct rdma_id_private *id_priv; + enum rdma_cm_state state; int ret; if (!timeout_ms) return -EINVAL; id_priv = container_of(id, struct rdma_id_private, id); - if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, RDMA_CM_ROUTE_QUERY)) + state = id_priv->state; + if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, + RDMA_CM_ROUTE_QUERY) && + !cma_comp_exch(id_priv, RDMA_CM_ADDRINFO_RESOLVED, + RDMA_CM_ROUTE_QUERY)) return -EINVAL; cma_id_get(id_priv); @@ -3409,7 +3415,7 @@ int rdma_resolve_route(struct rdma_cm_id *id, unsigned long timeout_ms) return 0; err: - cma_comp_exch(id_priv, RDMA_CM_ROUTE_QUERY, RDMA_CM_ADDR_RESOLVED); + cma_comp_exch(id_priv, RDMA_CM_ROUTE_QUERY, state); cma_id_put(id_priv); return ret; } @@ -5506,3 +5512,129 @@ static void __exit cma_cleanup(void) module_init(cma_init); module_exit(cma_cleanup); + +static void cma_query_ib_service_handler(int status, + struct sa_service_rec *recs, + unsigned int num_recs, void *context) +{ + struct cma_work *work = context; + struct rdma_id_private *id_priv = work->id; + struct sockaddr_ib *addr; + + if (status) + goto fail; + + if (!num_recs) { + status = -ENOENT; + goto fail; + } + + if (id_priv->id.route.service_recs) { + status = -EALREADY; + goto fail; + } + + id_priv->id.route.service_recs = + kmalloc_array(num_recs, sizeof(*recs), GFP_KERNEL); + if (!id_priv->id.route.service_recs) { + status = -ENOMEM; + goto fail; + } + + id_priv->id.route.num_service_recs = num_recs; + 
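For context, a hedged sketch of how a kernel consumer might drive the rdma_resolve_ib_service() entry point added below; the helper and service name are hypothetical, and the cm_id is assumed to be already bound to a device. Completion arrives asynchronously as an ADDRINFO event:

#include <rdma/rdma_cm.h>
#include <rdma/rdma_user_cm.h>
#include <linux/string.h>

/* Hypothetical ULP helper: look up a service record by name. */
static int example_resolve_by_name(struct rdma_cm_id *id)
{
	struct rdma_ucm_ib_service ibs = {
		.flags = RDMA_USER_CM_IB_SERVICE_FLAG_NAME,
	};

	strscpy(ibs.service_name, "example-svc", sizeof(ibs.service_name));

	/*
	 * Success is reported later as RDMA_CM_EVENT_ADDRINFO_RESOLVED
	 * on the id's event channel, failure as
	 * RDMA_CM_EVENT_ADDRINFO_ERROR.
	 */
	return rdma_resolve_ib_service(id, &ibs);
}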
memcpy(id_priv->id.route.service_recs, recs, sizeof(*recs) * num_recs); + + addr = (struct sockaddr_ib *)&id_priv->id.route.addr.dst_addr; + addr->sib_family = AF_IB; + addr->sib_addr = *(struct ib_addr *)&recs->gid; + addr->sib_pkey = recs->pkey; + addr->sib_sid = recs->id; + rdma_addr_set_dgid(&id_priv->id.route.addr.dev_addr, + (union ib_gid *)&addr->sib_addr); + ib_addr_set_pkey(&id_priv->id.route.addr.dev_addr, + ntohs(addr->sib_pkey)); + + queue_work(cma_wq, &work->work); + return; + +fail: + work->old_state = RDMA_CM_ADDRINFO_QUERY; + work->new_state = RDMA_CM_ADDR_BOUND; + work->event.event = RDMA_CM_EVENT_ADDRINFO_ERROR; + work->event.status = status; + pr_debug_ratelimited( + "RDMA CM: SERVICE_ERROR: failed to query service record. status %d\n", + status); + queue_work(cma_wq, &work->work); +} + +static int cma_resolve_ib_service(struct rdma_id_private *id_priv, + struct rdma_ucm_ib_service *ibs) +{ + struct sa_service_rec sr = {}; + ib_sa_comp_mask mask = 0; + struct cma_work *work; + + work = kzalloc(sizeof(*work), GFP_KERNEL); + if (!work) + return -ENOMEM; + + cma_id_get(id_priv); + + work->id = id_priv; + INIT_WORK(&work->work, cma_work_handler); + work->old_state = RDMA_CM_ADDRINFO_QUERY; + work->new_state = RDMA_CM_ADDRINFO_RESOLVED; + work->event.event = RDMA_CM_EVENT_ADDRINFO_RESOLVED; + + if (ibs->flags & RDMA_USER_CM_IB_SERVICE_FLAG_ID) { + sr.id = cpu_to_be64(ibs->service_id); + mask |= IB_SA_SERVICE_REC_SERVICE_ID; + } + if (ibs->flags & RDMA_USER_CM_IB_SERVICE_FLAG_NAME) { + strscpy(sr.name, ibs->service_name, sizeof(sr.name)); + mask |= IB_SA_SERVICE_REC_SERVICE_NAME; + } + + id_priv->query_id = ib_sa_service_rec_get(&sa_client, + id_priv->id.device, + id_priv->id.port_num, + &sr, mask, + 2000, GFP_KERNEL, + cma_query_ib_service_handler, + work, &id_priv->query); + + if (id_priv->query_id < 0) { + cma_id_put(id_priv); + kfree(work); + return id_priv->query_id; + } + + return 0; +} + +int rdma_resolve_ib_service(struct rdma_cm_id *id, + struct rdma_ucm_ib_service *ibs) +{ + struct rdma_id_private *id_priv; + int ret; + + id_priv = container_of(id, struct rdma_id_private, id); + if (!id_priv->cma_dev || + !cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_ADDRINFO_QUERY)) + return -EINVAL; + + if (rdma_cap_ib_sa(id->device, id->port_num)) + ret = cma_resolve_ib_service(id_priv, ibs); + else + ret = -EOPNOTSUPP; + + if (ret) + goto err; + + return 0; +err: + cma_comp_exch(id_priv, RDMA_CM_ADDRINFO_QUERY, RDMA_CM_ADDR_BOUND); + return ret; +} +EXPORT_SYMBOL(rdma_resolve_ib_service); diff --git a/drivers/infiniband/core/cma_priv.h b/drivers/infiniband/core/cma_priv.h index b7354c94cf1b..c604b601f4d9 100644 --- a/drivers/infiniband/core/cma_priv.h +++ b/drivers/infiniband/core/cma_priv.h @@ -47,7 +47,9 @@ enum rdma_cm_state { RDMA_CM_ADDR_BOUND, RDMA_CM_LISTEN, RDMA_CM_DEVICE_REMOVAL, - RDMA_CM_DESTROYING + RDMA_CM_DESTROYING, + RDMA_CM_ADDRINFO_QUERY, + RDMA_CM_ADDRINFO_RESOLVED }; struct rdma_id_private { diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 3145cb34a1d2..b4f3c835844a 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -1543,7 +1543,7 @@ static void __ib_unregister_device(struct ib_device *ib_dev) /* * We have a registration lock so that all the calls to unregister are - * fully fenced, once any unregister returns the device is truely + * fully fenced, once any unregister returns the device is truly * unregistered even if multiple callers are unregistering it at the * same time. 
This also interacts with the registration flow and * provides sane semantics if register and unregister are racing. diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index 53571e6b3162..c23e9c847314 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -107,6 +107,8 @@ struct ib_sa_device { struct ib_sa_query { void (*callback)(struct ib_sa_query *sa_query, int status, struct ib_sa_mad *mad); + void (*rmpp_callback)(struct ib_sa_query *sa_query, int status, + struct ib_mad_recv_wc *mad); void (*release)(struct ib_sa_query *); struct ib_sa_client *client; struct ib_sa_port *port; @@ -150,6 +152,13 @@ struct ib_sa_mcmember_query { struct ib_sa_query sa_query; }; +struct ib_sa_service_query { + void (*callback)(int status, struct sa_service_rec *rec, + unsigned int num_services, void *context); + void *context; + struct ib_sa_query sa_query; +}; + static LIST_HEAD(ib_nl_request_list); static DEFINE_SPINLOCK(ib_nl_request_lock); static atomic_t ib_nl_sa_request_seq; @@ -684,6 +693,58 @@ static const struct ib_field guidinfo_rec_table[] = { .size_bits = 512 }, }; +#define SERVICE_REC_FIELD(field) \ + .struct_offset_bytes = offsetof(struct sa_service_rec, field), \ + .struct_size_bytes = sizeof_field(struct sa_service_rec, field), \ + .field_name = "sa_service_rec:" #field + +static const struct ib_field service_rec_table[] = { + { SERVICE_REC_FIELD(id), + .offset_words = 0, + .offset_bits = 0, + .size_bits = 64 }, + { SERVICE_REC_FIELD(gid), + .offset_words = 2, + .offset_bits = 0, + .size_bits = 128 }, + { SERVICE_REC_FIELD(pkey), + .offset_words = 6, + .offset_bits = 0, + .size_bits = 16 }, + { RESERVED, + .offset_words = 6, + .offset_bits = 16, + .size_bits = 16 }, + { SERVICE_REC_FIELD(lease), + .offset_words = 7, + .offset_bits = 0, + .size_bits = 32 }, + { SERVICE_REC_FIELD(key), + .offset_words = 8, + .offset_bits = 0, + .size_bits = 128 }, + { SERVICE_REC_FIELD(name), + .offset_words = 12, + .offset_bits = 0, + .size_bits = 512 }, + { SERVICE_REC_FIELD(data_8), + .offset_words = 28, + .offset_bits = 0, + .size_bits = 128 }, + { SERVICE_REC_FIELD(data_16), + .offset_words = 32, + .offset_bits = 0, + .size_bits = 128 }, + { SERVICE_REC_FIELD(data_32), + .offset_words = 36, + .offset_bits = 0, + .size_bits = 128 }, + { SERVICE_REC_FIELD(data_64), + .offset_words = 40, + .offset_bits = 0, + .size_bits = 128 }, +}; + #define RDMA_PRIMARY_PATH_MAX_REC_NUM 3 static inline void ib_sa_disable_local_svc(struct ib_sa_query *query) @@ -1013,6 +1074,8 @@ int ib_nl_handle_set_timeout(struct sk_buff *skb, if (timeout > IB_SA_LOCAL_SVC_TIMEOUT_MAX) timeout = IB_SA_LOCAL_SVC_TIMEOUT_MAX; + spin_lock_irqsave(&ib_nl_request_lock, flags); + delta = timeout - sa_local_svc_timeout_ms; if (delta < 0) abs_delta = -delta; @@ -1020,7 +1083,6 @@ int ib_nl_handle_set_timeout(struct sk_buff *skb, abs_delta = delta; if (delta != 0) { - spin_lock_irqsave(&ib_nl_request_lock, flags); sa_local_svc_timeout_ms = timeout; list_for_each_entry(query, &ib_nl_request_list, list) { if (delta < 0 && abs_delta > query->timeout) @@ -1038,9 +1100,10 @@ int ib_nl_handle_set_timeout(struct sk_buff *skb, if (delay) mod_delayed_work(ib_nl_wq, &ib_nl_timed_work, (unsigned long)delay); - spin_unlock_irqrestore(&ib_nl_request_lock, flags); } + spin_unlock_irqrestore(&ib_nl_request_lock, flags); + settimeout_out: return 0; } @@ -1390,6 +1453,20 @@ void ib_sa_pack_path(struct sa_path_rec *rec, void *attribute) } EXPORT_SYMBOL(ib_sa_pack_path); +void 
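/*
 * Note on the ib_nl_handle_set_timeout() hunk above: ib_nl_request_lock
 * is now taken before delta is computed, so the read of
 * sa_local_svc_timeout_ms and the update derived from it form a single
 * critical section, and two concurrent SET_TIMEOUT requests can no
 * longer interleave between the read and the write.
 */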
ib_sa_pack_service(struct sa_service_rec *rec, void *attribute)
+{
+	ib_pack(service_rec_table, ARRAY_SIZE(service_rec_table), rec,
+		attribute);
+}
+EXPORT_SYMBOL(ib_sa_pack_service);
+
+void ib_sa_unpack_service(void *attribute, struct sa_service_rec *rec)
+{
+	ib_unpack(service_rec_table, ARRAY_SIZE(service_rec_table), attribute,
+		  rec);
+}
+EXPORT_SYMBOL(ib_sa_unpack_service);
+
 static bool ib_sa_opa_pathrecord_support(struct ib_sa_client *client,
 					 struct ib_sa_device *sa_dev,
 					 u32 port_num)
@@ -1479,6 +1556,68 @@ static void ib_sa_path_rec_callback(struct ib_sa_query *sa_query,
 	}
 }
 
+#define IB_SA_DATA_OFFS		56
+#define IB_SERVICE_REC_SZ	176
+
+static void ib_unpack_service_rmpp(struct sa_service_rec *rec,
+				   struct ib_mad_recv_wc *mad_wc,
+				   int num_services)
+{
+	unsigned int cp_sz, data_i, data_size, rec_i = 0, buf_i = 0;
+	struct ib_mad_recv_buf *mad_buf;
+	u8 buf[IB_SERVICE_REC_SZ];
+	u8 *data;
+
+	data_size = sizeof(((struct ib_sa_mad *) mad_buf->mad)->data);
+
+	list_for_each_entry(mad_buf, &mad_wc->rmpp_list, list) {
+		data = ((struct ib_sa_mad *) mad_buf->mad)->data;
+		data_i = 0;
+		while (data_i < data_size && rec_i < num_services) {
+			cp_sz = min(IB_SERVICE_REC_SZ - buf_i,
+				    data_size - data_i);
+			memcpy(buf + buf_i, data + data_i, cp_sz);
+			data_i += cp_sz;
+			buf_i += cp_sz;
+			if (buf_i == IB_SERVICE_REC_SZ) {
+				ib_sa_unpack_service(buf, rec + rec_i);
+				buf_i = 0;
+				rec_i++;
+			}
+		}
+	}
+}
+
+static void ib_sa_service_rec_callback(struct ib_sa_query *sa_query, int status,
+				       struct ib_mad_recv_wc *mad_wc)
+{
+	struct ib_sa_service_query *query =
+		container_of(sa_query, struct ib_sa_service_query, sa_query);
+	struct sa_service_rec *rec;
+	int num_services;
+
+	if (!mad_wc || !mad_wc->recv_buf.mad) {
+		query->callback(status, NULL, 0, query->context);
+		return;
+	}
+
+	num_services = (mad_wc->mad_len - IB_SA_DATA_OFFS) / IB_SERVICE_REC_SZ;
+	if (!num_services) {
+		query->callback(-ENODATA, NULL, 0, query->context);
+		return;
+	}
+
+	rec = kmalloc_array(num_services, sizeof(*rec), GFP_KERNEL);
+	if (!rec) {
+		query->callback(-ENOMEM, NULL, 0, query->context);
+		return;
+	}
+
+	ib_unpack_service_rmpp(rec, mad_wc, num_services);
+	query->callback(status, rec, num_services, query->context);
+	kfree(rec);
+}
+
 static void ib_sa_path_rec_release(struct ib_sa_query *sa_query)
 {
 	struct ib_sa_path_query *query =
@@ -1488,6 +1627,14 @@ static void ib_sa_path_rec_release(struct ib_sa_query *sa_query)
 	kfree(query);
 }
 
+static void ib_sa_service_rec_release(struct ib_sa_query *sa_query)
+{
+	struct ib_sa_service_query *query =
+		container_of(sa_query, struct ib_sa_service_query, sa_query);
+
+	kfree(query);
+}
+
 /**
  * ib_sa_path_rec_get - Start a Path get query
  * @client:SA client
@@ -1618,6 +1765,101 @@ err1:
 }
 EXPORT_SYMBOL(ib_sa_path_rec_get);
 
+/**
+ * ib_sa_service_rec_get - Start a Service get query
+ * @client: SA client
+ * @device: device to send query on
+ * @port_num: port number to send query on
+ * @rec: Service Record to send in query
+ * @comp_mask: component mask to send in query
+ * @timeout_ms: time to wait for response
+ * @gfp_mask: GFP mask to use for internal allocations
+ * @callback: function called when query completes, times out or is
+ * canceled
+ * @context: opaque user context passed to callback
+ * @sa_query: query context, used to cancel query
+ *
+ * Send a Service Record Get query to the SA to look up service records.
+ * The callback function will be called when the query completes (or
+ * fails); status is 0 for a successful response, -EINTR if the query
+ * is canceled, -ETIMEDOUT if the query timed out, or -EIO if an error
+ * occurred sending the query. The resp parameter of the callback is
+ * only valid if status is 0.
+ *
+ * If the return value of ib_sa_service_rec_get() is negative, it is an
+ * error code. Otherwise it is a query ID that can be used to cancel
+ * the query.
+ */
+int ib_sa_service_rec_get(struct ib_sa_client *client,
+			  struct ib_device *device, u32 port_num,
+			  struct sa_service_rec *rec,
+			  ib_sa_comp_mask comp_mask,
+			  unsigned long timeout_ms, gfp_t gfp_mask,
+			  void (*callback)(int status,
+					   struct sa_service_rec *resp,
+					   unsigned int num_services,
+					   void *context),
+			  void *context, struct ib_sa_query **sa_query)
+{
+	struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client);
+	struct ib_sa_service_query *query;
+	struct ib_mad_agent *agent;
+	struct ib_sa_port *port;
+	struct ib_sa_mad *mad;
+	int ret;
+
+	if (!sa_dev)
+		return -ENODEV;
+
+	port = &sa_dev->port[port_num - sa_dev->start_port];
+	agent = port->agent;
+
+	query = kzalloc(sizeof(*query), gfp_mask);
+	if (!query)
+		return -ENOMEM;
+
+	query->sa_query.port = port;
+
+	ret = alloc_mad(&query->sa_query, gfp_mask);
+	if (ret)
+		goto err1;
+
+	ib_sa_client_get(client);
+	query->sa_query.client = client;
+	query->callback = callback;
+	query->context = context;
+
+	mad = query->sa_query.mad_buf->mad;
+	init_mad(&query->sa_query, agent);
+
+	query->sa_query.rmpp_callback = callback ? ib_sa_service_rec_callback :
+						   NULL;
+	query->sa_query.release = ib_sa_service_rec_release;
+	mad->mad_hdr.method = IB_MGMT_METHOD_GET_TABLE;
+	mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_SERVICE_REC);
+	mad->sa_hdr.comp_mask = comp_mask;
+
+	ib_sa_pack_service(rec, mad->data);
+
+	*sa_query = &query->sa_query;
+	query->sa_query.mad_buf->context[1] = rec;
+
+	ret = send_mad(&query->sa_query, timeout_ms, gfp_mask);
+	if (ret < 0)
+		goto err2;
+
+	return ret;
+
+err2:
+	*sa_query = NULL;
+	ib_sa_client_put(query->sa_query.client);
+	free_mad(&query->sa_query);
+err1:
+	kfree(query);
+	return ret;
+}
+EXPORT_SYMBOL(ib_sa_service_rec_get);
+
 static void ib_sa_mcmember_rec_callback(struct ib_sa_query *sa_query,
 					int status, struct ib_sa_mad *mad)
 {
@@ -1987,23 +2229,29 @@ static void send_handler(struct ib_mad_agent *agent,
 {
 	struct ib_sa_query *query = mad_send_wc->send_buf->context[0];
 	unsigned long flags;
+	int status = 0;
 
-	if (query->callback)
+	if (query->callback || query->rmpp_callback) {
 		switch (mad_send_wc->status) {
 		case IB_WC_SUCCESS:
 			/* No callback -- already got recv */
 			break;
 		case IB_WC_RESP_TIMEOUT_ERR:
-			query->callback(query, -ETIMEDOUT, NULL);
+			status = -ETIMEDOUT;
 			break;
 		case IB_WC_WR_FLUSH_ERR:
-			query->callback(query, -EINTR, NULL);
+			status = -EINTR;
 			break;
 		default:
-			query->callback(query, -EIO, NULL);
+			status = -EIO;
 			break;
 		}
+		if (status)
+			query->callback ?
query->callback(query, status, NULL) : + query->rmpp_callback(query, status, NULL); + } + xa_lock_irqsave(&queries, flags); __xa_erase(&queries, query->id); xa_unlock_irqrestore(&queries, flags); @@ -2019,17 +2267,25 @@ static void recv_handler(struct ib_mad_agent *mad_agent, struct ib_mad_recv_wc *mad_recv_wc) { struct ib_sa_query *query; + struct ib_mad *mad; + if (!send_buf) return; query = send_buf->context[0]; - if (query->callback) { + mad = mad_recv_wc->recv_buf.mad; + + if (query->rmpp_callback) { + if (mad_recv_wc->wc->status == IB_WC_SUCCESS) + query->rmpp_callback(query, mad->mad_hdr.status ? + -EINVAL : 0, mad_recv_wc); + else + query->rmpp_callback(query, -EIO, NULL); + } else if (query->callback) { if (mad_recv_wc->wc->status == IB_WC_SUCCESS) - query->callback(query, - mad_recv_wc->recv_buf.mad->mad_hdr.status ? - -EINVAL : 0, - (struct ib_sa_mad *) mad_recv_wc->recv_buf.mad); + query->callback(query, mad->mad_hdr.status ? + -EINVAL : 0, (struct ib_sa_mad *)mad); else query->callback(query, -EIO, NULL); } @@ -2181,8 +2437,9 @@ static int ib_sa_add_one(struct ib_device *device) sa_dev->port[i].agent = ib_register_mad_agent(device, i + s, IB_QPT_GSI, - NULL, 0, send_handler, - recv_handler, sa_dev, 0); + NULL, IB_MGMT_RMPP_VERSION, + send_handler, recv_handler, + sa_dev, 0); if (IS_ERR(sa_dev->port[i].agent)) { ret = PTR_ERR(sa_dev->port[i].agent); goto err; diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index 6e700b974033..f86ece701db6 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -282,6 +282,10 @@ static struct ucma_event *ucma_create_uevent(struct ucma_context *ctx, } uevent->resp.event = event->event; uevent->resp.status = event->status; + + if (event->event == RDMA_CM_EVENT_ADDRINFO_RESOLVED) + goto out; + if (ctx->cm_id->qp_type == IB_QPT_UD) ucma_copy_ud_event(ctx->cm_id->device, &uevent->resp.param.ud, &event->param.ud); @@ -289,6 +293,7 @@ static struct ucma_event *ucma_create_uevent(struct ucma_context *ctx, ucma_copy_conn_event(&uevent->resp.param.conn, &event->param.conn); +out: uevent->resp.ece.vendor_id = event->ece.vendor_id; uevent->resp.ece.attr_mod = event->ece.attr_mod; return uevent; @@ -728,6 +733,28 @@ static ssize_t ucma_resolve_addr(struct ucma_file *file, return ret; } +static ssize_t ucma_resolve_ib_service(struct ucma_file *file, + const char __user *inbuf, int in_len, + int out_len) +{ + struct rdma_ucm_resolve_ib_service cmd; + struct ucma_context *ctx; + int ret; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + ctx = ucma_get_ctx(file, cmd.id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + mutex_lock(&ctx->mutex); + ret = rdma_resolve_ib_service(ctx->cm_id, &cmd.ibs); + mutex_unlock(&ctx->mutex); + ucma_put_ctx(ctx); + return ret; +} + static ssize_t ucma_resolve_route(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) @@ -994,6 +1021,43 @@ static ssize_t ucma_query_gid(struct ucma_context *ctx, return ret; } +static ssize_t ucma_query_ib_service(struct ucma_context *ctx, + void __user *response, int out_len) +{ + struct rdma_ucm_query_ib_service_resp *resp; + int n, ret = 0; + + if (out_len < sizeof(struct rdma_ucm_query_ib_service_resp)) + return -ENOSPC; + + if (!ctx->cm_id->route.service_recs) + return -ENODATA; + + resp = kzalloc(out_len, GFP_KERNEL); + if (!resp) + return -ENOMEM; + + resp->num_service_recs = ctx->cm_id->route.num_service_recs; + + n = (out_len - sizeof(struct rdma_ucm_query_ib_service_resp)) / + sizeof(struct 
ib_user_service_rec); + + if (!n) + goto out; + + if (n > ctx->cm_id->route.num_service_recs) + n = ctx->cm_id->route.num_service_recs; + + memcpy(resp->recs, ctx->cm_id->route.service_recs, + sizeof(*resp->recs) * n); + if (copy_to_user(response, resp, struct_size(resp, recs, n))) + ret = -EFAULT; + +out: + kfree(resp); + return ret; +} + static ssize_t ucma_query(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) @@ -1022,6 +1086,9 @@ static ssize_t ucma_query(struct ucma_file *file, case RDMA_USER_CM_QUERY_GID: ret = ucma_query_gid(ctx, response, out_len); break; + case RDMA_USER_CM_QUERY_IB_SERVICE: + ret = ucma_query_ib_service(ctx, response, out_len); + break; default: ret = -ENOSYS; break; @@ -1678,6 +1745,55 @@ err_unlock: return ret; } +static ssize_t ucma_write_cm_event(struct ucma_file *file, + const char __user *inbuf, int in_len, + int out_len) +{ + struct rdma_ucm_write_cm_event cmd; + struct rdma_cm_event event = {}; + struct ucma_event *uevent; + struct ucma_context *ctx; + int ret = 0; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + if ((cmd.event != RDMA_CM_EVENT_USER) && + (cmd.event != RDMA_CM_EVENT_INTERNAL)) + return -EINVAL; + + ctx = ucma_get_ctx(file, cmd.id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + event.event = cmd.event; + event.status = cmd.status; + event.param.arg = cmd.param.arg; + + uevent = kzalloc(sizeof(*uevent), GFP_KERNEL); + if (!uevent) { + ret = -ENOMEM; + goto out; + } + + uevent->ctx = ctx; + uevent->resp.uid = ctx->uid; + uevent->resp.id = ctx->id; + uevent->resp.event = event.event; + uevent->resp.status = event.status; + memcpy(uevent->resp.param.arg32, &event.param.arg, + sizeof(event.param.arg)); + + mutex_lock(&ctx->file->mut); + list_add_tail(&uevent->list, &ctx->file->event_list); + mutex_unlock(&ctx->file->mut); + wake_up_interruptible(&ctx->file->poll_wait); + +out: + ucma_put_ctx(ctx); + return ret; +} + static ssize_t (*ucma_cmd_table[])(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) = { @@ -1703,7 +1819,9 @@ static ssize_t (*ucma_cmd_table[])(struct ucma_file *file, [RDMA_USER_CM_CMD_QUERY] = ucma_query, [RDMA_USER_CM_CMD_BIND] = ucma_bind, [RDMA_USER_CM_CMD_RESOLVE_ADDR] = ucma_resolve_addr, - [RDMA_USER_CM_CMD_JOIN_MCAST] = ucma_join_multicast + [RDMA_USER_CM_CMD_JOIN_MCAST] = ucma_join_multicast, + [RDMA_USER_CM_CMD_RESOLVE_IB_SERVICE] = ucma_resolve_ib_service, + [RDMA_USER_CM_CMD_WRITE_CM_EVENT] = ucma_write_cm_event, }; static ssize_t ucma_write(struct file *filp, const char __user *buf, diff --git a/drivers/infiniband/hw/Makefile b/drivers/infiniband/hw/Makefile index df61b2299ec0..b706dc0d0263 100644 --- a/drivers/infiniband/hw/Makefile +++ b/drivers/infiniband/hw/Makefile @@ -14,3 +14,4 @@ obj-$(CONFIG_INFINIBAND_HNS_HIP08) += hns/ obj-$(CONFIG_INFINIBAND_QEDR) += qedr/ obj-$(CONFIG_INFINIBAND_BNXT_RE) += bnxt_re/ obj-$(CONFIG_INFINIBAND_ERDMA) += erdma/ +obj-$(CONFIG_INFINIBAND_IONIC) += ionic/ diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h index 6df5a2738c95..3485e495ac6a 100644 --- a/drivers/infiniband/hw/bnxt_re/bnxt_re.h +++ b/drivers/infiniband/hw/bnxt_re/bnxt_re.h @@ -172,9 +172,9 @@ struct bnxt_re_dev { struct list_head list; unsigned long flags; #define BNXT_RE_FLAG_NETDEV_REGISTERED 0 +#define BNXT_RE_FLAG_STATS_CTX3_ALLOC 1 #define BNXT_RE_FLAG_HAVE_L2_REF 3 #define BNXT_RE_FLAG_RCFW_CHANNEL_EN 4 -#define BNXT_RE_FLAG_QOS_WORK_REG 5 #define BNXT_RE_FLAG_RESOURCES_ALLOCATED 7 #define 
BNXT_RE_FLAG_RESOURCES_INITIALIZED 8 #define BNXT_RE_FLAG_ERR_DEVICE_DETACHED 17 @@ -187,9 +187,6 @@ struct bnxt_re_dev { int id; - struct delayed_work worker; - u8 cur_prio_map; - /* RCFW Channel */ struct bnxt_qplib_rcfw rcfw; @@ -227,6 +224,13 @@ struct bnxt_re_dev { struct workqueue_struct *dcb_wq; struct dentry *cc_config; struct bnxt_re_dbg_cc_config_params *cc_config_params; +#define BNXT_VPD_FLD_LEN 32 + char board_partno[BNXT_VPD_FLD_LEN]; + /* RoCE mirror */ + u16 mirror_vnic_id; + union ib_gid ugid; + u32 ugid_index; + u8 sniffer_flow_created : 1; }; #define to_bnxt_re_dev(ptr, member) \ @@ -243,6 +247,10 @@ int bnxt_re_assign_pma_port_counters(struct bnxt_re_dev *rdev, struct ib_mad *ou int bnxt_re_assign_pma_port_ext_counters(struct bnxt_re_dev *rdev, struct ib_mad *out_mad); +void bnxt_re_hwrm_free_vnic(struct bnxt_re_dev *rdev); +int bnxt_re_hwrm_alloc_vnic(struct bnxt_re_dev *rdev); +int bnxt_re_hwrm_cfg_vnic(struct bnxt_re_dev *rdev, u32 qp_id); + static inline struct device *rdev_to_dev(struct bnxt_re_dev *rdev) { if (rdev) @@ -276,4 +284,7 @@ static inline int bnxt_re_read_context_allowed(struct bnxt_re_dev *rdev) #define BNXT_RE_CONTEXT_TYPE_MRW_SIZE_P7 192 #define BNXT_RE_CONTEXT_TYPE_SRQ_SIZE_P7 192 +#define BNXT_RE_HWRM_CMD_TIMEOUT(rdev) \ + ((rdev)->chip_ctx->hwrm_cmd_max_timeout * 1000) + #endif diff --git a/drivers/infiniband/hw/bnxt_re/debugfs.c b/drivers/infiniband/hw/bnxt_re/debugfs.c index e632f1661b92..be5e9b5ca2f0 100644 --- a/drivers/infiniband/hw/bnxt_re/debugfs.c +++ b/drivers/infiniband/hw/bnxt_re/debugfs.c @@ -8,6 +8,7 @@ #include <linux/debugfs.h> #include <linux/pci.h> +#include <linux/seq_file.h> #include <rdma/ib_addr.h> #include "bnxt_ulp.h" @@ -314,6 +315,40 @@ static const struct file_operations bnxt_re_cc_config_ops = { .write = bnxt_re_cc_config_set, }; +static int info_show(struct seq_file *m, void *unused) +{ + struct bnxt_re_dev *rdev = m->private; + struct bnxt_re_res_cntrs *res_s = &rdev->stats.res; + + seq_puts(m, "Info:\n"); + seq_printf(m, "Device Name\t\t: %s\n", dev_name(&rdev->ibdev.dev)); + seq_printf(m, "PD Watermark\t\t: %llu\n", res_s->pd_watermark); + seq_printf(m, "AH Watermark\t\t: %llu\n", res_s->ah_watermark); + seq_printf(m, "QP Watermark\t\t: %llu\n", res_s->qp_watermark); + seq_printf(m, "RC QP Watermark\t\t: %llu\n", res_s->rc_qp_watermark); + seq_printf(m, "UD QP Watermark\t\t: %llu\n", res_s->ud_qp_watermark); + seq_printf(m, "SRQ Watermark\t\t: %llu\n", res_s->srq_watermark); + seq_printf(m, "CQ Watermark\t\t: %llu\n", res_s->cq_watermark); + seq_printf(m, "MR Watermark\t\t: %llu\n", res_s->mr_watermark); + seq_printf(m, "MW Watermark\t\t: %llu\n", res_s->mw_watermark); + seq_printf(m, "CQ Resize Count\t\t: %d\n", atomic_read(&res_s->resize_count)); + if (rdev->pacing.dbr_pacing) { + seq_printf(m, "DB Pacing Reschedule\t: %llu\n", rdev->stats.pacing.resched); + seq_printf(m, "DB Pacing Complete\t: %llu\n", rdev->stats.pacing.complete); + seq_printf(m, "DB Pacing Alerts\t: %llu\n", rdev->stats.pacing.alerts); + seq_printf(m, "DB FIFO Register\t: 0x%x\n", + readl(rdev->en_dev->bar0 + rdev->pacing.dbr_db_fifo_reg_off)); + } + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(info); + +static void bnxt_re_debugfs_add_info(struct bnxt_re_dev *rdev) +{ + debugfs_create_file("info", 0400, rdev->dbg_root, rdev, &info_fops); +} + void bnxt_re_debugfs_add_pdev(struct bnxt_re_dev *rdev) { struct pci_dev *pdev = rdev->en_dev->pdev; @@ -325,6 +360,8 @@ void bnxt_re_debugfs_add_pdev(struct bnxt_re_dev *rdev) rdev->qp_debugfs = 
debugfs_create_dir("QPs", rdev->dbg_root); rdev->cc_config = debugfs_create_dir("cc_config", rdev->dbg_root); + bnxt_re_debugfs_add_info(rdev); + rdev->cc_config_params = kzalloc(sizeof(*cc_params), GFP_KERNEL); for (i = 0; i < BNXT_RE_CC_PARAM_GEN0; i++) { diff --git a/drivers/infiniband/hw/bnxt_re/hw_counters.c b/drivers/infiniband/hw/bnxt_re/hw_counters.c index 44bb082e0a60..651cf9d0e0c7 100644 --- a/drivers/infiniband/hw/bnxt_re/hw_counters.c +++ b/drivers/infiniband/hw/bnxt_re/hw_counters.c @@ -51,25 +51,6 @@ #include "hw_counters.h" static const struct rdma_stat_desc bnxt_re_stat_descs[] = { - [BNXT_RE_ACTIVE_PD].name = "active_pds", - [BNXT_RE_ACTIVE_AH].name = "active_ahs", - [BNXT_RE_ACTIVE_QP].name = "active_qps", - [BNXT_RE_ACTIVE_RC_QP].name = "active_rc_qps", - [BNXT_RE_ACTIVE_UD_QP].name = "active_ud_qps", - [BNXT_RE_ACTIVE_SRQ].name = "active_srqs", - [BNXT_RE_ACTIVE_CQ].name = "active_cqs", - [BNXT_RE_ACTIVE_MR].name = "active_mrs", - [BNXT_RE_ACTIVE_MW].name = "active_mws", - [BNXT_RE_WATERMARK_PD].name = "watermark_pds", - [BNXT_RE_WATERMARK_AH].name = "watermark_ahs", - [BNXT_RE_WATERMARK_QP].name = "watermark_qps", - [BNXT_RE_WATERMARK_RC_QP].name = "watermark_rc_qps", - [BNXT_RE_WATERMARK_UD_QP].name = "watermark_ud_qps", - [BNXT_RE_WATERMARK_SRQ].name = "watermark_srqs", - [BNXT_RE_WATERMARK_CQ].name = "watermark_cqs", - [BNXT_RE_WATERMARK_MR].name = "watermark_mrs", - [BNXT_RE_WATERMARK_MW].name = "watermark_mws", - [BNXT_RE_RESIZE_CQ_CNT].name = "resize_cq_cnt", [BNXT_RE_RX_PKTS].name = "rx_pkts", [BNXT_RE_RX_BYTES].name = "rx_bytes", [BNXT_RE_TX_PKTS].name = "tx_pkts", @@ -79,22 +60,22 @@ static const struct rdma_stat_desc bnxt_re_stat_descs[] = { [BNXT_RE_TX_DISCARDS].name = "tx_roce_discards", [BNXT_RE_RX_ERRORS].name = "rx_roce_errors", [BNXT_RE_RX_DISCARDS].name = "rx_roce_discards", - [BNXT_RE_TO_RETRANSMITS].name = "to_retransmits", - [BNXT_RE_SEQ_ERR_NAKS_RCVD].name = "seq_err_naks_rcvd", - [BNXT_RE_MAX_RETRY_EXCEEDED].name = "max_retry_exceeded", - [BNXT_RE_RNR_NAKS_RCVD].name = "rnr_naks_rcvd", - [BNXT_RE_MISSING_RESP].name = "missing_resp", + [BNXT_RE_TO_RETRANSMITS].name = "local_ack_timeout_err", + [BNXT_RE_SEQ_ERR_NAKS_RCVD].name = "packet_seq_err", + [BNXT_RE_MAX_RETRY_EXCEEDED].name = "max_retry_exceeded", + [BNXT_RE_RNR_NAKS_RCVD].name = "rnr_nak_retry_err", + [BNXT_RE_MISSING_RESP].name = "implied_nak_seq_err", [BNXT_RE_UNRECOVERABLE_ERR].name = "unrecoverable_err", [BNXT_RE_BAD_RESP_ERR].name = "bad_resp_err", [BNXT_RE_LOCAL_QP_OP_ERR].name = "local_qp_op_err", [BNXT_RE_LOCAL_PROTECTION_ERR].name = "local_protection_err", [BNXT_RE_MEM_MGMT_OP_ERR].name = "mem_mgmt_op_err", - [BNXT_RE_REMOTE_INVALID_REQ_ERR].name = "remote_invalid_req_err", - [BNXT_RE_REMOTE_ACCESS_ERR].name = "remote_access_err", + [BNXT_RE_REMOTE_INVALID_REQ_ERR].name = "req_remote_invalid_request", + [BNXT_RE_REMOTE_ACCESS_ERR].name = "req_remote_access_errors", [BNXT_RE_REMOTE_OP_ERR].name = "remote_op_err", - [BNXT_RE_DUP_REQ].name = "dup_req", + [BNXT_RE_DUP_REQ].name = "duplicate_request", [BNXT_RE_RES_EXCEED_MAX].name = "res_exceed_max", - [BNXT_RE_RES_LENGTH_MISMATCH].name = "res_length_mismatch", + [BNXT_RE_RES_LENGTH_MISMATCH].name = "resp_local_length_error", [BNXT_RE_RES_EXCEEDS_WQE].name = "res_exceeds_wqe", [BNXT_RE_RES_OPCODE_ERR].name = "res_opcode_err", [BNXT_RE_RES_RX_INVALID_RKEY].name = "res_rx_invalid_rkey", @@ -118,7 +99,7 @@ static const struct rdma_stat_desc bnxt_re_stat_descs[] = { [BNXT_RE_RES_SRQ_LOAD_ERR].name = "res_srq_load_err", 
[BNXT_RE_RES_TX_PCI_ERR].name = "res_tx_pci_err", [BNXT_RE_RES_RX_PCI_ERR].name = "res_rx_pci_err", - [BNXT_RE_OUT_OF_SEQ_ERR].name = "oos_drop_count", + [BNXT_RE_OUT_OF_SEQ_ERR].name = "out_of_sequence", [BNXT_RE_TX_ATOMIC_REQ].name = "tx_atomic_req", [BNXT_RE_TX_READ_REQ].name = "tx_read_req", [BNXT_RE_TX_READ_RES].name = "tx_read_resp", @@ -126,23 +107,22 @@ static const struct rdma_stat_desc bnxt_re_stat_descs[] = { [BNXT_RE_TX_SEND_REQ].name = "tx_send_req", [BNXT_RE_TX_ROCE_PKTS].name = "tx_roce_only_pkts", [BNXT_RE_TX_ROCE_BYTES].name = "tx_roce_only_bytes", - [BNXT_RE_RX_ATOMIC_REQ].name = "rx_atomic_req", - [BNXT_RE_RX_READ_REQ].name = "rx_read_req", + [BNXT_RE_RX_ATOMIC_REQ].name = "rx_atomic_requests", + [BNXT_RE_RX_READ_REQ].name = "rx_read_requests", [BNXT_RE_RX_READ_RESP].name = "rx_read_resp", - [BNXT_RE_RX_WRITE_REQ].name = "rx_write_req", + [BNXT_RE_RX_WRITE_REQ].name = "rx_write_requests", [BNXT_RE_RX_SEND_REQ].name = "rx_send_req", [BNXT_RE_RX_ROCE_PKTS].name = "rx_roce_only_pkts", [BNXT_RE_RX_ROCE_BYTES].name = "rx_roce_only_bytes", [BNXT_RE_RX_ROCE_GOOD_PKTS].name = "rx_roce_good_pkts", [BNXT_RE_RX_ROCE_GOOD_BYTES].name = "rx_roce_good_bytes", - [BNXT_RE_OOB].name = "rx_out_of_buffer", - [BNXT_RE_TX_CNP].name = "tx_cnp_pkts", - [BNXT_RE_RX_CNP].name = "rx_cnp_pkts", - [BNXT_RE_RX_ECN].name = "rx_ecn_marked_pkts", - [BNXT_RE_PACING_RESCHED].name = "pacing_reschedule", - [BNXT_RE_PACING_CMPL].name = "pacing_complete", - [BNXT_RE_PACING_ALERT].name = "pacing_alerts", - [BNXT_RE_DB_FIFO_REG].name = "db_fifo_register", + [BNXT_RE_OOB].name = "out_of_buffer", + [BNXT_RE_TX_CNP].name = "np_cnp_pkts", + [BNXT_RE_RX_CNP].name = "rp_cnp_handled", + [BNXT_RE_RX_ECN].name = "np_ecn_marked_roce_packets", + [BNXT_RE_REQ_CQE_ERROR].name = "req_cqe_error", + [BNXT_RE_RESP_CQE_ERROR].name = "resp_cqe_error", + [BNXT_RE_RESP_REMOTE_ACCESS_ERRS].name = "resp_remote_access_errors", }; static void bnxt_re_copy_ext_stats(struct bnxt_re_dev *rdev, @@ -273,18 +253,20 @@ static void bnxt_re_copy_err_stats(struct bnxt_re_dev *rdev, err_s->res_rx_pci_err; stats->value[BNXT_RE_OUT_OF_SEQ_ERR] = err_s->res_oos_drop_count; -} - -static void bnxt_re_copy_db_pacing_stats(struct bnxt_re_dev *rdev, - struct rdma_hw_stats *stats) -{ - struct bnxt_re_db_pacing_stats *pacing_s = &rdev->stats.pacing; - - stats->value[BNXT_RE_PACING_RESCHED] = pacing_s->resched; - stats->value[BNXT_RE_PACING_CMPL] = pacing_s->complete; - stats->value[BNXT_RE_PACING_ALERT] = pacing_s->alerts; - stats->value[BNXT_RE_DB_FIFO_REG] = - readl(rdev->en_dev->bar0 + rdev->pacing.dbr_db_fifo_reg_off); + stats->value[BNXT_RE_REQ_CQE_ERROR] = + err_s->bad_resp_err + + err_s->local_qp_op_err + + err_s->local_protection_err + + err_s->mem_mgmt_op_err + + err_s->remote_invalid_req_err + + err_s->remote_access_err + + err_s->remote_op_err; + stats->value[BNXT_RE_RESP_CQE_ERROR] = + err_s->res_cmp_err + + err_s->res_cq_load_err; + stats->value[BNXT_RE_RESP_REMOTE_ACCESS_ERRS] = + err_s->res_rx_no_perm + + err_s->res_tx_no_perm; } int bnxt_re_assign_pma_port_ext_counters(struct bnxt_re_dev *rdev, struct ib_mad *out_mad) @@ -382,7 +364,6 @@ int bnxt_re_ib_get_hw_stats(struct ib_device *ibdev, u32 port, int index) { struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev); - struct bnxt_re_res_cntrs *res_s = &rdev->stats.res; struct bnxt_qplib_roce_stats *err_s = NULL; struct ctx_hw_stats *hw_stats = NULL; int rc = 0; @@ -391,26 +372,6 @@ int bnxt_re_ib_get_hw_stats(struct ib_device *ibdev, if (!port || !stats) return -EINVAL; - 
stats->value[BNXT_RE_ACTIVE_QP] = atomic_read(&res_s->qp_count); - stats->value[BNXT_RE_ACTIVE_RC_QP] = atomic_read(&res_s->rc_qp_count); - stats->value[BNXT_RE_ACTIVE_UD_QP] = atomic_read(&res_s->ud_qp_count); - stats->value[BNXT_RE_ACTIVE_SRQ] = atomic_read(&res_s->srq_count); - stats->value[BNXT_RE_ACTIVE_CQ] = atomic_read(&res_s->cq_count); - stats->value[BNXT_RE_ACTIVE_MR] = atomic_read(&res_s->mr_count); - stats->value[BNXT_RE_ACTIVE_MW] = atomic_read(&res_s->mw_count); - stats->value[BNXT_RE_ACTIVE_PD] = atomic_read(&res_s->pd_count); - stats->value[BNXT_RE_ACTIVE_AH] = atomic_read(&res_s->ah_count); - stats->value[BNXT_RE_WATERMARK_QP] = res_s->qp_watermark; - stats->value[BNXT_RE_WATERMARK_RC_QP] = res_s->rc_qp_watermark; - stats->value[BNXT_RE_WATERMARK_UD_QP] = res_s->ud_qp_watermark; - stats->value[BNXT_RE_WATERMARK_SRQ] = res_s->srq_watermark; - stats->value[BNXT_RE_WATERMARK_CQ] = res_s->cq_watermark; - stats->value[BNXT_RE_WATERMARK_MR] = res_s->mr_watermark; - stats->value[BNXT_RE_WATERMARK_MW] = res_s->mw_watermark; - stats->value[BNXT_RE_WATERMARK_PD] = res_s->pd_watermark; - stats->value[BNXT_RE_WATERMARK_AH] = res_s->ah_watermark; - stats->value[BNXT_RE_RESIZE_CQ_CNT] = atomic_read(&res_s->resize_count); - if (hw_stats) { stats->value[BNXT_RE_RECOVERABLE_ERRORS] = le64_to_cpu(hw_stats->tx_bcast_pkts); @@ -449,8 +410,6 @@ int bnxt_re_ib_get_hw_stats(struct ib_device *ibdev, goto done; } } - if (rdev->pacing.dbr_pacing && bnxt_qplib_is_chip_gen_p5_p7(rdev->chip_ctx)) - bnxt_re_copy_db_pacing_stats(rdev, stats); } done: diff --git a/drivers/infiniband/hw/bnxt_re/hw_counters.h b/drivers/infiniband/hw/bnxt_re/hw_counters.h index e541b6f8ca9f..09d371d442aa 100644 --- a/drivers/infiniband/hw/bnxt_re/hw_counters.h +++ b/drivers/infiniband/hw/bnxt_re/hw_counters.h @@ -41,25 +41,6 @@ #define __BNXT_RE_HW_STATS_H__ enum bnxt_re_hw_stats { - BNXT_RE_ACTIVE_PD, - BNXT_RE_ACTIVE_AH, - BNXT_RE_ACTIVE_QP, - BNXT_RE_ACTIVE_RC_QP, - BNXT_RE_ACTIVE_UD_QP, - BNXT_RE_ACTIVE_SRQ, - BNXT_RE_ACTIVE_CQ, - BNXT_RE_ACTIVE_MR, - BNXT_RE_ACTIVE_MW, - BNXT_RE_WATERMARK_PD, - BNXT_RE_WATERMARK_AH, - BNXT_RE_WATERMARK_QP, - BNXT_RE_WATERMARK_RC_QP, - BNXT_RE_WATERMARK_UD_QP, - BNXT_RE_WATERMARK_SRQ, - BNXT_RE_WATERMARK_CQ, - BNXT_RE_WATERMARK_MR, - BNXT_RE_WATERMARK_MW, - BNXT_RE_RESIZE_CQ_CNT, BNXT_RE_RX_PKTS, BNXT_RE_RX_BYTES, BNXT_RE_TX_PKTS, @@ -129,10 +110,9 @@ enum bnxt_re_hw_stats { BNXT_RE_TX_CNP, BNXT_RE_RX_CNP, BNXT_RE_RX_ECN, - BNXT_RE_PACING_RESCHED, - BNXT_RE_PACING_CMPL, - BNXT_RE_PACING_ALERT, - BNXT_RE_DB_FIFO_REG, + BNXT_RE_REQ_CQE_ERROR, + BNXT_RE_RESP_CQE_ERROR, + BNXT_RE_RESP_REMOTE_ACCESS_ERRS, BNXT_RE_NUM_EXT_COUNTERS }; diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 260dc67b8b87..4dab5ca7362b 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -288,7 +288,9 @@ int bnxt_re_query_port(struct ib_device *ibdev, u32 port_num, } port_attr->max_mtu = IB_MTU_4096; port_attr->active_mtu = iboe_get_mtu(rdev->netdev->mtu); - port_attr->gid_tbl_len = dev_attr->max_sgid; + /* One GID is reserved for RawEth QP. Report one less */ + port_attr->gid_tbl_len = (rdev->rcfw.roce_mirror ? 
(dev_attr->max_sgid - 1) : + dev_attr->max_sgid); port_attr->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_REINIT_SUP | IB_PORT_DEVICE_MGMT_SUP | IB_PORT_VENDOR_CLASS_SUP; @@ -375,7 +377,7 @@ int bnxt_re_del_gid(const struct ib_gid_attr *attr, void **context) if (!ctx) return -EINVAL; - if (sgid_tbl && sgid_tbl->active) { + if (sgid_tbl->active) { if (ctx->idx >= sgid_tbl->max) return -EINVAL; gid_to_del = &sgid_tbl->tbl[ctx->idx].gid; @@ -429,7 +431,7 @@ int bnxt_re_add_gid(const struct ib_gid_attr *attr, void **context) rc = bnxt_qplib_add_sgid(sgid_tbl, (struct bnxt_qplib_gid *)&attr->gid, rdev->qplib_res.netdev->dev_addr, - vlan_id, true, &tbl_idx); + vlan_id, true, &tbl_idx, false, 0); if (rc == -EALREADY) { ctx_tbl = sgid_tbl->ctx; ctx_tbl[tbl_idx]->refcnt++; @@ -955,6 +957,20 @@ fail: return rc; } +static void bnxt_re_del_unique_gid(struct bnxt_re_dev *rdev) +{ + int rc; + + if (!rdev->rcfw.roce_mirror) + return; + + rc = bnxt_qplib_del_sgid(&rdev->qplib_res.sgid_tbl, + (struct bnxt_qplib_gid *)&rdev->ugid, + 0xFFFF, true); + if (rc) + dev_err(rdev_to_dev(rdev), "Failed to delete unique GID, rc: %d\n", rc); +} + /* Queue Pairs */ int bnxt_re_destroy_qp(struct ib_qp *ib_qp, struct ib_udata *udata) { @@ -994,6 +1010,9 @@ int bnxt_re_destroy_qp(struct ib_qp *ib_qp, struct ib_udata *udata) else if (qp->qplib_qp.type == CMDQ_CREATE_QP_TYPE_UD) atomic_dec(&rdev->stats.res.ud_qp_count); + if (qp->qplib_qp.type == CMDQ_CREATE_QP_TYPE_RAW_ETHERTYPE) + bnxt_re_del_unique_gid(rdev); + ib_umem_release(qp->rumem); ib_umem_release(qp->sumem); @@ -1018,6 +1037,8 @@ static u8 __from_ib_qp_type(enum ib_qp_type type) return CMDQ_CREATE_QP_TYPE_RC; case IB_QPT_UD: return CMDQ_CREATE_QP_TYPE_UD; + case IB_QPT_RAW_PACKET: + return CMDQ_CREATE_QP_TYPE_RAW_ETHERTYPE; default: return IB_QPT_MAX; } @@ -1595,6 +1616,29 @@ static bool bnxt_re_test_qp_limits(struct bnxt_re_dev *rdev, return rc; } +static int bnxt_re_add_unique_gid(struct bnxt_re_dev *rdev) +{ + struct bnxt_qplib_ctx *hctx = &rdev->qplib_ctx; + struct bnxt_qplib_res *res = &rdev->qplib_res; + int rc; + + if (!rdev->rcfw.roce_mirror) + return 0; + + rdev->ugid.global.subnet_prefix = cpu_to_be64(0xfe8000000000abcdLL); + addrconf_ifid_eui48(&rdev->ugid.raw[8], rdev->netdev); + + rc = bnxt_qplib_add_sgid(&res->sgid_tbl, + (struct bnxt_qplib_gid *)&rdev->ugid, + rdev->qplib_res.netdev->dev_addr, + 0xFFFF, true, &rdev->ugid_index, true, + hctx->stats3.fw_id); + if (rc) + dev_err(rdev_to_dev(rdev), "Failed to add unique GID. rc = %d\n", rc); + + return rc; +} + int bnxt_re_create_qp(struct ib_qp *ib_qp, struct ib_qp_init_attr *qp_init_attr, struct ib_udata *udata) { @@ -1656,6 +1700,17 @@ int bnxt_re_create_qp(struct ib_qp *ib_qp, struct ib_qp_init_attr *qp_init_attr, } } + /* Support for RawEth QP is added to capture TCP pkt dump. 
+ * So unique SGID is used to avoid incorrect statistics on per + * function stats_ctx + */ + if (qp->qplib_qp.type == CMDQ_CREATE_QP_TYPE_RAW_ETHERTYPE) { + rc = bnxt_re_add_unique_gid(rdev); + if (rc) + goto qp_destroy; + qp->qplib_qp.ugid_index = rdev->ugid_index; + } + qp->ib_qp.qp_num = qp->qplib_qp.id; if (qp_init_attr->qp_type == IB_QPT_GSI) rdev->gsi_ctx.gsi_qp = qp; @@ -2301,7 +2356,7 @@ int bnxt_re_query_qp(struct ib_qp *ib_qp, struct ib_qp_attr *qp_attr, qp_attr->pkey_index = qplib_qp->pkey_index; qp_attr->qkey = qplib_qp->qkey; qp_attr->ah_attr.type = RDMA_AH_ATTR_TYPE_ROCE; - rdma_ah_set_grh(&qp_attr->ah_attr, NULL, qplib_qp->ah.flow_label, + rdma_ah_set_grh(&qp_attr->ah_attr, NULL, qplib_qp->udp_sport, qplib_qp->ah.host_sgid_index, qplib_qp->ah.hop_limit, qplib_qp->ah.traffic_class); @@ -3248,9 +3303,9 @@ int bnxt_re_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) IB_ACCESS_LOCAL_WRITE); if (IS_ERR(cq->resize_umem)) { rc = PTR_ERR(cq->resize_umem); + ibdev_err(&rdev->ibdev, "%s: ib_umem_get failed! rc = %pe\n", + __func__, cq->resize_umem); cq->resize_umem = NULL; - ibdev_err(&rdev->ibdev, "%s: ib_umem_get failed! rc = %d\n", - __func__, rc); goto fail; } cq->resize_cqe = entries; @@ -4392,6 +4447,93 @@ void bnxt_re_dealloc_ucontext(struct ib_ucontext *ib_uctx) } } +static int bnxt_re_setup_vnic(struct bnxt_re_dev *rdev, struct bnxt_re_qp *qp) +{ + int rc; + + rc = bnxt_re_hwrm_alloc_vnic(rdev); + if (rc) + return rc; + + rc = bnxt_re_hwrm_cfg_vnic(rdev, qp->qplib_qp.id); + if (rc) + goto out_free_vnic; + + return 0; +out_free_vnic: + bnxt_re_hwrm_free_vnic(rdev); + return rc; +} + +struct ib_flow *bnxt_re_create_flow(struct ib_qp *ib_qp, + struct ib_flow_attr *attr, + struct ib_udata *udata) +{ + struct bnxt_re_qp *qp = container_of(ib_qp, struct bnxt_re_qp, ib_qp); + struct bnxt_re_dev *rdev = qp->rdev; + struct bnxt_re_flow *flow; + int rc; + + if (attr->type != IB_FLOW_ATTR_SNIFFER || + !rdev->rcfw.roce_mirror) + return ERR_PTR(-EOPNOTSUPP); + + mutex_lock(&rdev->qp_lock); + if (rdev->sniffer_flow_created) { + ibdev_err(&rdev->ibdev, "RoCE Mirroring is already Configured\n"); + mutex_unlock(&rdev->qp_lock); + return ERR_PTR(-EBUSY); + } + + flow = kzalloc(sizeof(*flow), GFP_KERNEL); + if (!flow) { + mutex_unlock(&rdev->qp_lock); + return ERR_PTR(-ENOMEM); + } + + flow->rdev = rdev; + + rc = bnxt_re_setup_vnic(rdev, qp); + if (rc) + goto out_free_flow; + + rc = bnxt_qplib_create_flow(&rdev->qplib_res); + if (rc) + goto out_free_vnic; + + rdev->sniffer_flow_created = 1; + mutex_unlock(&rdev->qp_lock); + + return &flow->ib_flow; + +out_free_vnic: + bnxt_re_hwrm_free_vnic(rdev); +out_free_flow: + mutex_unlock(&rdev->qp_lock); + kfree(flow); + return ERR_PTR(rc); +} + +int bnxt_re_destroy_flow(struct ib_flow *flow_id) +{ + struct bnxt_re_flow *flow = + container_of(flow_id, struct bnxt_re_flow, ib_flow); + struct bnxt_re_dev *rdev = flow->rdev; + int rc; + + mutex_lock(&rdev->qp_lock); + rc = bnxt_qplib_destroy_flow(&rdev->qplib_res); + if (rc) + ibdev_dbg(&rdev->ibdev, "failed to destroy_flow rc = %d\n", rc); + rdev->sniffer_flow_created = 0; + + bnxt_re_hwrm_free_vnic(rdev); + mutex_unlock(&rdev->qp_lock); + kfree(flow); + + return rc; +} + static struct bnxt_re_cq *bnxt_re_search_for_cq(struct bnxt_re_dev *rdev, u32 cq_id) { struct bnxt_re_cq *cq = NULL, *tmp_cq; @@ -4604,7 +4746,7 @@ static int UVERBS_HANDLER(BNXT_RE_METHOD_ALLOC_PAGE)(struct uverbs_attr_bundle * return err; err = uverbs_copy_to(attrs, BNXT_RE_ALLOC_PAGE_DPI, - &dpi, sizeof(length)); + &dpi, 
sizeof(dpi)); if (err) return err; diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.h b/drivers/infiniband/hw/bnxt_re/ib_verbs.h index fe00ab691a51..76ba9ab04d5c 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.h +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.h @@ -164,6 +164,11 @@ struct bnxt_re_user_mmap_entry { u8 mmap_flag; }; +struct bnxt_re_flow { + struct ib_flow ib_flow; + struct bnxt_re_dev *rdev; +}; + static inline u16 bnxt_re_get_swqe_size(int nsge) { return sizeof(struct sq_send_hdr) + nsge * sizeof(struct sq_sge); @@ -267,6 +272,11 @@ struct ib_mr *bnxt_re_reg_user_mr_dmabuf(struct ib_pd *ib_pd, u64 start, struct uverbs_attr_bundle *attrs); int bnxt_re_alloc_ucontext(struct ib_ucontext *ctx, struct ib_udata *udata); void bnxt_re_dealloc_ucontext(struct ib_ucontext *context); +struct ib_flow *bnxt_re_create_flow(struct ib_qp *ib_qp, + struct ib_flow_attr *attr, + struct ib_udata *udata); +int bnxt_re_destroy_flow(struct ib_flow *flow_id); + int bnxt_re_mmap(struct ib_ucontext *context, struct vm_area_struct *vma); void bnxt_re_mmap_free(struct rdma_user_mmap_entry *rdma_entry); diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index df7cf8d68e27..b13810572c2e 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -80,6 +80,7 @@ MODULE_LICENSE("Dual BSD/GPL"); static DEFINE_MUTEX(bnxt_re_mutex); static int bnxt_re_hwrm_qcaps(struct bnxt_re_dev *rdev); +static int bnxt_re_query_hwrm_intf_version(struct bnxt_re_dev *rdev); static int bnxt_re_hwrm_qcfg(struct bnxt_re_dev *rdev, u32 *db_len, u32 *offset); @@ -188,6 +189,10 @@ static int bnxt_re_setup_chip_ctx(struct bnxt_re_dev *rdev) rdev->qplib_res.is_vf = BNXT_EN_VF(en_dev); rdev->qplib_res.en_dev = en_dev; + rc = bnxt_re_query_hwrm_intf_version(rdev); + if (rc) + goto free_dev_attr; + bnxt_re_set_drv_mode(rdev); bnxt_re_set_db_offset(rdev); @@ -540,6 +545,72 @@ static void bnxt_re_fill_fw_msg(struct bnxt_fw_msg *fw_msg, void *msg, fw_msg->timeout = timeout; } +void bnxt_re_hwrm_free_vnic(struct bnxt_re_dev *rdev) +{ + struct bnxt_en_dev *en_dev = rdev->en_dev; + struct hwrm_vnic_free_input req = {}; + struct bnxt_fw_msg fw_msg = {}; + int rc; + + bnxt_re_init_hwrm_hdr((void *)&req, HWRM_VNIC_FREE); + + req.vnic_id = cpu_to_le32(rdev->mirror_vnic_id); + bnxt_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), NULL, + 0, BNXT_RE_HWRM_CMD_TIMEOUT(rdev)); + rc = bnxt_send_msg(en_dev, &fw_msg); + if (rc) + ibdev_dbg(&rdev->ibdev, + "Failed to free vnic, rc = %d\n", rc); +} + +int bnxt_re_hwrm_alloc_vnic(struct bnxt_re_dev *rdev) +{ + struct bnxt_en_dev *en_dev = rdev->en_dev; + struct hwrm_vnic_alloc_output resp = {}; + struct hwrm_vnic_alloc_input req = {}; + struct bnxt_fw_msg fw_msg = {}; + int rc; + + bnxt_re_init_hwrm_hdr((void *)&req, HWRM_VNIC_ALLOC); + + req.vnic_id = cpu_to_le16(rdev->mirror_vnic_id); + req.flags = cpu_to_le32(VNIC_ALLOC_REQ_FLAGS_VNIC_ID_VALID); + bnxt_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp, + sizeof(resp), BNXT_RE_HWRM_CMD_TIMEOUT(rdev)); + rc = bnxt_send_msg(en_dev, &fw_msg); + if (rc) + ibdev_dbg(&rdev->ibdev, + "Failed to alloc vnic, rc = %d\n", rc); + + return rc; +} + +int bnxt_re_hwrm_cfg_vnic(struct bnxt_re_dev *rdev, u32 qp_id) +{ + struct bnxt_en_dev *en_dev = rdev->en_dev; + struct hwrm_vnic_cfg_input req = {}; + struct bnxt_fw_msg fw_msg = {}; + int rc; + + bnxt_re_init_hwrm_hdr((void *)&req, HWRM_VNIC_CFG); + + req.flags = cpu_to_le32(VNIC_CFG_REQ_FLAGS_ROCE_ONLY_VNIC_MODE); + 
req.enables = cpu_to_le32(VNIC_CFG_REQ_ENABLES_RAW_QP_ID | + VNIC_CFG_REQ_ENABLES_MRU); + req.vnic_id = cpu_to_le16(rdev->mirror_vnic_id); + req.raw_qp_id = cpu_to_le32(qp_id); + req.mru = cpu_to_le16(rdev->netdev->mtu + VLAN_ETH_HLEN); + + bnxt_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), NULL, + 0, BNXT_RE_HWRM_CMD_TIMEOUT(rdev)); + rc = bnxt_send_msg(en_dev, &fw_msg); + if (rc) + ibdev_dbg(&rdev->ibdev, + "Failed to cfg vnic, rc = %d\n", rc); + + return rc; +} + /* Query device config using common hwrm */ static int bnxt_re_hwrm_qcfg(struct bnxt_re_dev *rdev, u32 *db_len, u32 *offset) @@ -553,11 +624,12 @@ static int bnxt_re_hwrm_qcfg(struct bnxt_re_dev *rdev, u32 *db_len, bnxt_re_init_hwrm_hdr((void *)&req, HWRM_FUNC_QCFG); req.fid = cpu_to_le16(0xffff); bnxt_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp, - sizeof(resp), DFLT_HWRM_CMD_TIMEOUT); + sizeof(resp), BNXT_RE_HWRM_CMD_TIMEOUT(rdev)); rc = bnxt_send_msg(en_dev, &fw_msg); if (!rc) { *db_len = PAGE_ALIGN(le16_to_cpu(resp.l2_doorbell_bar_size_kb) * 1024); *offset = PAGE_ALIGN(le16_to_cpu(resp.legacy_l2_db_size_kb) * 1024); + rdev->mirror_vnic_id = le16_to_cpu(resp.mirror_vnic_id); } return rc; } @@ -577,7 +649,7 @@ int bnxt_re_hwrm_qcaps(struct bnxt_re_dev *rdev) bnxt_re_init_hwrm_hdr((void *)&req, HWRM_FUNC_QCAPS); req.fid = cpu_to_le16(0xffff); bnxt_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp, - sizeof(resp), DFLT_HWRM_CMD_TIMEOUT); + sizeof(resp), BNXT_RE_HWRM_CMD_TIMEOUT(rdev)); rc = bnxt_send_msg(en_dev, &fw_msg); if (rc) @@ -587,6 +659,8 @@ int bnxt_re_hwrm_qcaps(struct bnxt_re_dev *rdev) flags_ext2 = le32_to_cpu(resp.flags_ext2); cctx->modes.dbr_pacing = flags_ext2 & FUNC_QCAPS_RESP_FLAGS_EXT2_DBR_PACING_EXT_SUPPORTED || flags_ext2 & FUNC_QCAPS_RESP_FLAGS_EXT2_DBR_PACING_V0_SUPPORTED; + cctx->modes.roce_mirror = !!(le32_to_cpu(resp.flags_ext3) & + FUNC_QCAPS_RESP_FLAGS_EXT3_MIRROR_ON_ROCE_SUPPORTED); return 0; } @@ -603,7 +677,7 @@ static int bnxt_re_hwrm_dbr_pacing_qcfg(struct bnxt_re_dev *rdev) cctx = rdev->chip_ctx; bnxt_re_init_hwrm_hdr((void *)&req, HWRM_FUNC_DBR_PACING_QCFG); bnxt_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp, - sizeof(resp), DFLT_HWRM_CMD_TIMEOUT); + sizeof(resp), BNXT_RE_HWRM_CMD_TIMEOUT(rdev)); rc = bnxt_send_msg(en_dev, &fw_msg); if (rc) return rc; @@ -842,20 +916,12 @@ static void bnxt_re_deinitialize_dbr_pacing(struct bnxt_re_dev *rdev) static int bnxt_re_net_ring_free(struct bnxt_re_dev *rdev, u16 fw_ring_id, int type) { - struct bnxt_en_dev *en_dev; + struct bnxt_en_dev *en_dev = rdev->en_dev; struct hwrm_ring_free_input req = {}; struct hwrm_ring_free_output resp; struct bnxt_fw_msg fw_msg = {}; int rc = -EINVAL; - if (!rdev) - return rc; - - en_dev = rdev->en_dev; - - if (!en_dev) - return rc; - if (test_bit(BNXT_RE_FLAG_ERR_DEVICE_DETACHED, &rdev->flags)) return 0; @@ -863,7 +929,7 @@ static int bnxt_re_net_ring_free(struct bnxt_re_dev *rdev, req.ring_type = type; req.ring_id = cpu_to_le16(fw_ring_id); bnxt_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp, - sizeof(resp), DFLT_HWRM_CMD_TIMEOUT); + sizeof(resp), BNXT_RE_HWRM_CMD_TIMEOUT(rdev)); rc = bnxt_send_msg(en_dev, &fw_msg); if (rc) ibdev_err(&rdev->ibdev, "Failed to free HW ring:%d :%#x", @@ -881,9 +947,6 @@ static int bnxt_re_net_ring_alloc(struct bnxt_re_dev *rdev, struct bnxt_fw_msg fw_msg = {}; int rc = -EINVAL; - if (!en_dev) - return rc; - bnxt_re_init_hwrm_hdr((void *)&req, HWRM_RING_ALLOC); req.enables = 0; req.page_tbl_addr = 
cpu_to_le64(ring_attr->dma_arr[0]); @@ -899,7 +962,7 @@ static int bnxt_re_net_ring_alloc(struct bnxt_re_dev *rdev, req.ring_type = ring_attr->type; req.int_mode = ring_attr->mode; bnxt_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp, - sizeof(resp), DFLT_HWRM_CMD_TIMEOUT); + sizeof(resp), BNXT_RE_HWRM_CMD_TIMEOUT(rdev)); rc = bnxt_send_msg(en_dev, &fw_msg); if (!rc) *fw_ring_id = le16_to_cpu(resp.ring_id); @@ -916,16 +979,13 @@ static int bnxt_re_net_stats_ctx_free(struct bnxt_re_dev *rdev, struct bnxt_fw_msg fw_msg = {}; int rc = -EINVAL; - if (!en_dev) - return rc; - if (test_bit(BNXT_RE_FLAG_ERR_DEVICE_DETACHED, &rdev->flags)) return 0; bnxt_re_init_hwrm_hdr((void *)&req, HWRM_STAT_CTX_FREE); req.stat_ctx_id = cpu_to_le32(fw_stats_ctx_id); bnxt_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp, - sizeof(resp), DFLT_HWRM_CMD_TIMEOUT); + sizeof(resp), BNXT_RE_HWRM_CMD_TIMEOUT(rdev)); rc = bnxt_send_msg(en_dev, &fw_msg); if (rc) ibdev_err(&rdev->ibdev, "Failed to free HW stats context %#x", @@ -935,8 +995,7 @@ static int bnxt_re_net_stats_ctx_free(struct bnxt_re_dev *rdev, } static int bnxt_re_net_stats_ctx_alloc(struct bnxt_re_dev *rdev, - dma_addr_t dma_map, - u32 *fw_stats_ctx_id) + struct bnxt_qplib_stats *stats) { struct bnxt_qplib_chip_ctx *chip_ctx = rdev->chip_ctx; struct hwrm_stat_ctx_alloc_output resp = {}; @@ -945,21 +1004,18 @@ static int bnxt_re_net_stats_ctx_alloc(struct bnxt_re_dev *rdev, struct bnxt_fw_msg fw_msg = {}; int rc = -EINVAL; - *fw_stats_ctx_id = INVALID_STATS_CTX_ID; - - if (!en_dev) - return rc; + stats->fw_id = INVALID_STATS_CTX_ID; bnxt_re_init_hwrm_hdr((void *)&req, HWRM_STAT_CTX_ALLOC); req.update_period_ms = cpu_to_le32(1000); - req.stats_dma_addr = cpu_to_le64(dma_map); + req.stats_dma_addr = cpu_to_le64(stats->dma_map); req.stats_dma_length = cpu_to_le16(chip_ctx->hw_stats_size); req.stat_ctx_flags = STAT_CTX_ALLOC_REQ_STAT_CTX_FLAGS_ROCE; bnxt_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp, - sizeof(resp), DFLT_HWRM_CMD_TIMEOUT); + sizeof(resp), BNXT_RE_HWRM_CMD_TIMEOUT(rdev)); rc = bnxt_send_msg(en_dev, &fw_msg); if (!rc) - *fw_stats_ctx_id = le32_to_cpu(resp.stat_ctx_id); + stats->fw_id = le32_to_cpu(resp.stat_ctx_id); return rc; } @@ -975,7 +1031,7 @@ static ssize_t hw_rev_show(struct device *device, struct device_attribute *attr, struct bnxt_re_dev *rdev = rdma_device_to_drv_device(device, struct bnxt_re_dev, ibdev); - return sysfs_emit(buf, "0x%x\n", rdev->en_dev->pdev->vendor); + return sysfs_emit(buf, "0x%x\n", rdev->en_dev->pdev->revision); } static DEVICE_ATTR_RO(hw_rev); @@ -985,13 +1041,31 @@ static ssize_t hca_type_show(struct device *device, struct bnxt_re_dev *rdev = rdma_device_to_drv_device(device, struct bnxt_re_dev, ibdev); - return sysfs_emit(buf, "%s\n", rdev->ibdev.node_desc); + return sysfs_emit(buf, "0x%x\n", rdev->en_dev->pdev->device); } static DEVICE_ATTR_RO(hca_type); +static ssize_t board_id_show(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct bnxt_re_dev *rdev = rdma_device_to_drv_device(device, + struct bnxt_re_dev, ibdev); + char buffer[BNXT_VPD_FLD_LEN] = {}; + + if (!rdev->is_virtfn) + memcpy(buffer, rdev->board_partno, BNXT_VPD_FLD_LEN - 1); + else + scnprintf(buffer, BNXT_VPD_FLD_LEN, "0x%x-VF", + rdev->en_dev->pdev->device); + + return sysfs_emit(buf, "%s\n", buffer); +} +static DEVICE_ATTR_RO(board_id); + static struct attribute *bnxt_re_attributes[] = { &dev_attr_hw_rev.attr, &dev_attr_hca_type.attr, + &dev_attr_board_id.attr, NULL }; 
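The board_id attribute above follows the standard pattern for read-only RDMA-device sysfs attributes; a hedged generic sketch (struct foo_dev and the serial field are hypothetical):

#include <linux/device.h>
#include <rdma/ib_verbs.h>

struct foo_dev {
	struct ib_device ibdev;
	char serial[32];
};

static ssize_t serial_show(struct device *device,
			   struct device_attribute *attr, char *buf)
{
	struct foo_dev *fdev =
		rdma_device_to_drv_device(device, struct foo_dev, ibdev);

	/* sysfs_emit() bounds the output to a single PAGE_SIZE buffer. */
	return sysfs_emit(buf, "%s\n", fdev->serial);
}
static DEVICE_ATTR_RO(serial);

DEVICE_ATTR_RO(serial) emits dev_attr_serial, whose .attr member is what gets listed in the attribute array, exactly as board_id is wired up above.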
@@ -1207,6 +1281,8 @@ static int bnxt_re_fill_res_srq_entry(struct sk_buff *msg, struct ib_srq *ib_srq goto err; if (rdma_nl_put_driver_u32_hex(msg, "max_sge", srq->qplib_srq.max_sge)) goto err; + if (rdma_nl_put_driver_u32_hex(msg, "srq_limit", srq->qplib_srq.threshold)) + goto err; nla_nest_end(msg, table_attr); return 0; @@ -1297,6 +1373,8 @@ static const struct ib_device_ops bnxt_re_dev_ops = { .reg_user_mr_dmabuf = bnxt_re_reg_user_mr_dmabuf, .req_notify_cq = bnxt_re_req_notify_cq, .resize_cq = bnxt_re_resize_cq, + .create_flow = bnxt_re_create_flow, + .destroy_flow = bnxt_re_destroy_flow, INIT_RDMA_OBJ_SIZE(ib_ah, bnxt_re_ah, ib_ah), INIT_RDMA_OBJ_SIZE(ib_cq, bnxt_re_cq, ib_cq), INIT_RDMA_OBJ_SIZE(ib_pd, bnxt_re_pd, ib_pd), @@ -1323,8 +1401,7 @@ static int bnxt_re_register_ib(struct bnxt_re_dev *rdev) /* ib device init */ ibdev->node_type = RDMA_NODE_IB_CA; - strscpy(ibdev->node_desc, BNXT_RE_DESC " HCA", - strlen(BNXT_RE_DESC) + 5); + strscpy(ibdev->node_desc, BNXT_RE_DESC " HCA"); ibdev->phys_port_cnt = 1; addrconf_addr_eui48((u8 *)&ibdev->node_guid, rdev->netdev->dev_addr); @@ -1850,81 +1927,6 @@ static void bnxt_re_dev_stop(struct bnxt_re_dev *rdev) mutex_unlock(&rdev->qp_lock); } -static int bnxt_re_update_gid(struct bnxt_re_dev *rdev) -{ - struct bnxt_qplib_sgid_tbl *sgid_tbl = &rdev->qplib_res.sgid_tbl; - struct bnxt_qplib_gid gid; - u16 gid_idx, index; - int rc = 0; - - if (!ib_device_try_get(&rdev->ibdev)) - return 0; - - for (index = 0; index < sgid_tbl->active; index++) { - gid_idx = sgid_tbl->hw_id[index]; - - if (!memcmp(&sgid_tbl->tbl[index], &bnxt_qplib_gid_zero, - sizeof(bnxt_qplib_gid_zero))) - continue; - /* need to modify the VLAN enable setting of non VLAN GID only - * as setting is done for VLAN GID while adding GID - */ - if (sgid_tbl->vlan[index]) - continue; - - memcpy(&gid, &sgid_tbl->tbl[index], sizeof(gid)); - - rc = bnxt_qplib_update_sgid(sgid_tbl, &gid, gid_idx, - rdev->qplib_res.netdev->dev_addr); - } - - ib_device_put(&rdev->ibdev); - return rc; -} - -static u32 bnxt_re_get_priority_mask(struct bnxt_re_dev *rdev) -{ - u32 prio_map = 0, tmp_map = 0; - struct net_device *netdev; - struct dcb_app app = {}; - - netdev = rdev->netdev; - - app.selector = IEEE_8021QAZ_APP_SEL_ETHERTYPE; - app.protocol = ETH_P_IBOE; - tmp_map = dcb_ieee_getapp_mask(netdev, &app); - prio_map = tmp_map; - - app.selector = IEEE_8021QAZ_APP_SEL_DGRAM; - app.protocol = ROCE_V2_UDP_DPORT; - tmp_map = dcb_ieee_getapp_mask(netdev, &app); - prio_map |= tmp_map; - - return prio_map; -} - -static int bnxt_re_setup_qos(struct bnxt_re_dev *rdev) -{ - u8 prio_map = 0; - - /* Get priority for roce */ - prio_map = bnxt_re_get_priority_mask(rdev); - - if (prio_map == rdev->cur_prio_map) - return 0; - rdev->cur_prio_map = prio_map; - /* Actual priorities are not programmed as they are already - * done by L2 driver; just enable or disable priority vlan tagging - */ - if ((prio_map == 0 && rdev->qplib_res.prio) || - (prio_map != 0 && !rdev->qplib_res.prio)) { - rdev->qplib_res.prio = prio_map; - bnxt_re_update_gid(rdev); - } - - return 0; -} - static void bnxt_re_net_unregister_async_event(struct bnxt_re_dev *rdev) { if (rdev->is_virtfn) @@ -1945,7 +1947,31 @@ static void bnxt_re_net_register_async_event(struct bnxt_re_dev *rdev) ASYNC_EVENT_CMPL_EVENT_ID_DCB_CONFIG_CHANGE); } -static void bnxt_re_query_hwrm_intf_version(struct bnxt_re_dev *rdev) +static void bnxt_re_read_vpd_info(struct bnxt_re_dev *rdev) +{ + struct pci_dev *pdev = rdev->en_dev->pdev; + unsigned int vpd_size, kw_len; + int pos, 
size; + u8 *vpd_data; + + vpd_data = pci_vpd_alloc(pdev, &vpd_size); + if (IS_ERR(vpd_data)) { + pci_warn(pdev, "Unable to read VPD, err=%pe\n", vpd_data); + return; + } + + pos = pci_vpd_find_ro_info_keyword(vpd_data, vpd_size, + PCI_VPD_RO_KEYWORD_PARTNO, &kw_len); + if (pos < 0) + goto free; + + size = min_t(int, kw_len, BNXT_VPD_FLD_LEN - 1); + memcpy(rdev->board_partno, &vpd_data[pos], size); +free: + kfree(vpd_data); +} + +static int bnxt_re_query_hwrm_intf_version(struct bnxt_re_dev *rdev) { struct bnxt_en_dev *en_dev = rdev->en_dev; struct hwrm_ver_get_output resp = {}; @@ -1964,7 +1990,7 @@ static void bnxt_re_query_hwrm_intf_version(struct bnxt_re_dev *rdev) if (rc) { ibdev_err(&rdev->ibdev, "Failed to query HW version, rc = 0x%x", rc); - return; + return rc; } cctx = rdev->chip_ctx; @@ -1978,6 +2004,8 @@ static void bnxt_re_query_hwrm_intf_version(struct bnxt_re_dev *rdev) if (!cctx->hwrm_cmd_max_timeout) cctx->hwrm_cmd_max_timeout = RCFW_FW_STALL_MAX_TIMEOUT; + + return 0; } static int bnxt_re_ib_init(struct bnxt_re_dev *rdev) @@ -2039,6 +2067,72 @@ static void bnxt_re_free_gid_ctx(struct bnxt_re_dev *rdev) } } +static int bnxt_re_get_stats_ctx(struct bnxt_re_dev *rdev) +{ + struct bnxt_qplib_ctx *hctx = &rdev->qplib_ctx; + struct bnxt_qplib_res *res = &rdev->qplib_res; + int rc; + + rc = bnxt_qplib_alloc_stats_ctx(res->pdev, res->cctx, &hctx->stats); + if (rc) + return rc; + + rc = bnxt_re_net_stats_ctx_alloc(rdev, &hctx->stats); + if (rc) + goto free_stat_mem; + + return 0; +free_stat_mem: + bnxt_qplib_free_stats_ctx(res->pdev, &hctx->stats); + + return rc; +} + +static int bnxt_re_get_stats3_ctx(struct bnxt_re_dev *rdev) +{ + struct bnxt_qplib_ctx *hctx = &rdev->qplib_ctx; + struct bnxt_qplib_res *res = &rdev->qplib_res; + int rc; + + if (!rdev->rcfw.roce_mirror) + return 0; + + rc = bnxt_qplib_alloc_stats_ctx(res->pdev, res->cctx, &hctx->stats3); + if (rc) + return rc; + + rc = bnxt_re_net_stats_ctx_alloc(rdev, &hctx->stats3); + if (rc) + goto free_stat_mem; + + return 0; +free_stat_mem: + bnxt_qplib_free_stats_ctx(res->pdev, &hctx->stats3); + + return rc; +} + +static void bnxt_re_put_stats3_ctx(struct bnxt_re_dev *rdev) +{ + struct bnxt_qplib_ctx *hctx = &rdev->qplib_ctx; + struct bnxt_qplib_res *res = &rdev->qplib_res; + + if (!rdev->rcfw.roce_mirror) + return; + + bnxt_re_net_stats_ctx_free(rdev, hctx->stats3.fw_id); + bnxt_qplib_free_stats_ctx(res->pdev, &hctx->stats3); +} + +static void bnxt_re_put_stats_ctx(struct bnxt_re_dev *rdev) +{ + struct bnxt_qplib_ctx *hctx = &rdev->qplib_ctx; + struct bnxt_qplib_res *res = &rdev->qplib_res; + + bnxt_re_net_stats_ctx_free(rdev, hctx->stats.fw_id); + bnxt_qplib_free_stats_ctx(res->pdev, &hctx->stats); +} + static void bnxt_re_dev_uninit(struct bnxt_re_dev *rdev, u8 op_type) { u8 type; @@ -2049,8 +2143,7 @@ static void bnxt_re_dev_uninit(struct bnxt_re_dev *rdev, u8 op_type) bnxt_re_net_unregister_async_event(rdev); bnxt_re_uninit_dcb_wq(rdev); - if (test_and_clear_bit(BNXT_RE_FLAG_QOS_WORK_REG, &rdev->flags)) - cancel_delayed_work_sync(&rdev->worker); + bnxt_re_put_stats3_ctx(rdev); bnxt_re_free_gid_ctx(rdev); if (test_and_clear_bit(BNXT_RE_FLAG_RESOURCES_INITIALIZED, @@ -2064,8 +2157,8 @@ static void bnxt_re_dev_uninit(struct bnxt_re_dev *rdev, u8 op_type) if (rc) ibdev_warn(&rdev->ibdev, "Failed to deinitialize RCFW: %#x", rc); - bnxt_re_net_stats_ctx_free(rdev, rdev->qplib_ctx.stats.fw_id); - bnxt_qplib_free_ctx(&rdev->qplib_res, &rdev->qplib_ctx); + bnxt_re_put_stats_ctx(rdev); + bnxt_qplib_free_hwctx(&rdev->qplib_res, 
&rdev->qplib_ctx); bnxt_qplib_disable_rcfw_channel(&rdev->rcfw); type = bnxt_qplib_get_ring_type(rdev->chip_ctx); bnxt_re_net_ring_free(rdev, rdev->rcfw.creq.ring_id, type); @@ -2085,16 +2178,6 @@ static void bnxt_re_dev_uninit(struct bnxt_re_dev *rdev, u8 op_type) } } -/* worker thread for polling periodic events. Now used for QoS programming*/ -static void bnxt_re_worker(struct work_struct *work) -{ - struct bnxt_re_dev *rdev = container_of(work, struct bnxt_re_dev, - worker.work); - - bnxt_re_setup_qos(rdev); - schedule_delayed_work(&rdev->worker, msecs_to_jiffies(30000)); -} - static int bnxt_re_dev_init(struct bnxt_re_dev *rdev, u8 op_type) { struct bnxt_re_ring_attr rattr = {}; @@ -2109,8 +2192,9 @@ static int bnxt_re_dev_init(struct bnxt_re_dev *rdev, u8 op_type) rc = bnxt_re_register_netdev(rdev); if (rc) { ibdev_err(&rdev->ibdev, - "Failed to register with netedev: %#x\n", rc); - return -EINVAL; + "Failed to register with Ethernet driver, rc %d\n", + rc); + return rc; } } set_bit(BNXT_RE_FLAG_NETDEV_REGISTERED, &rdev->flags); @@ -2148,8 +2232,6 @@ static int bnxt_re_dev_init(struct bnxt_re_dev *rdev, u8 op_type) /* Check whether VF or PF */ bnxt_re_get_sriov_func_type(rdev); - bnxt_re_query_hwrm_intf_version(rdev); - /* Establish RCFW Communication Channel to initialize the context * memory for the function and all child VFs */ @@ -2199,18 +2281,20 @@ static int bnxt_re_dev_init(struct bnxt_re_dev *rdev, u8 op_type) if (rc) goto disable_rcfw; + bnxt_qplib_query_version(&rdev->rcfw); bnxt_re_set_resource_limits(rdev); - rc = bnxt_qplib_alloc_ctx(&rdev->qplib_res, &rdev->qplib_ctx, 0, - bnxt_qplib_is_chip_gen_p5_p7(rdev->chip_ctx)); - if (rc) { - ibdev_err(&rdev->ibdev, - "Failed to allocate QPLIB context: %#x\n", rc); - goto disable_rcfw; + if (!rdev->is_virtfn && + !bnxt_qplib_is_chip_gen_p5_p7(rdev->chip_ctx)) { + rc = bnxt_qplib_alloc_hwctx(&rdev->qplib_res, &rdev->qplib_ctx); + if (rc) { + ibdev_err(&rdev->ibdev, + "Failed to allocate hw context: %#x\n", rc); + goto disable_rcfw; + } } - rc = bnxt_re_net_stats_ctx_alloc(rdev, - rdev->qplib_ctx.stats.dma_map, - &rdev->qplib_ctx.stats.fw_id); + + rc = bnxt_re_get_stats_ctx(rdev); if (rc) { ibdev_err(&rdev->ibdev, "Failed to allocate stats context: %#x\n", rc); @@ -2249,15 +2333,6 @@ static int bnxt_re_dev_init(struct bnxt_re_dev *rdev, u8 op_type) if (rc) ibdev_warn(&rdev->ibdev, "Failed to query CC defaults\n"); - rc = bnxt_re_setup_qos(rdev); - if (rc) - ibdev_info(&rdev->ibdev, - "RoCE priority not yet configured\n"); - - INIT_DELAYED_WORK(&rdev->worker, bnxt_re_worker); - set_bit(BNXT_RE_FLAG_QOS_WORK_REG, &rdev->flags); - schedule_delayed_work(&rdev->worker, msecs_to_jiffies(30000)); - if (!(rdev->qplib_res.en_dev->flags & BNXT_EN_FLAG_ROCE_VF_RES_MGMT)) bnxt_re_vf_res_config(rdev); } @@ -2270,11 +2345,18 @@ static int bnxt_re_dev_init(struct bnxt_re_dev *rdev, u8 op_type) bnxt_re_init_dcb_wq(rdev); bnxt_re_net_register_async_event(rdev); + if (!rdev->is_virtfn) + bnxt_re_read_vpd_info(rdev); + + rc = bnxt_re_get_stats3_ctx(rdev); + if (rc) + goto fail; + return 0; free_sctx: bnxt_re_net_stats_ctx_free(rdev, rdev->qplib_ctx.stats.fw_id); free_ctx: - bnxt_qplib_free_ctx(&rdev->qplib_res, &rdev->qplib_ctx); + bnxt_qplib_free_hwctx(&rdev->qplib_res, &rdev->qplib_ctx); disable_rcfw: bnxt_qplib_disable_rcfw_channel(&rdev->rcfw); free_ring: diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c index ee36b3d82cc0..ce90d3d834d4 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c +++ 
b/drivers/infiniband/hw/bnxt_re/qplib_fp.c @@ -1307,6 +1307,7 @@ static bool is_optimized_state_transition(struct bnxt_qplib_qp *qp) int bnxt_qplib_modify_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) { + struct bnxt_qplib_sgid_tbl *sgid_tbl = &res->sgid_tbl; struct bnxt_qplib_rcfw *rcfw = res->rcfw; struct creq_modify_qp_resp resp = {}; struct bnxt_qplib_cmdqmsg msg = {}; @@ -1358,9 +1359,14 @@ int bnxt_qplib_modify_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) if (bmask & CMDQ_MODIFY_QP_MODIFY_MASK_FLOW_LABEL) req.flow_label = cpu_to_le32(qp->ah.flow_label); - if (bmask & CMDQ_MODIFY_QP_MODIFY_MASK_SGID_INDEX) - req.sgid_index = cpu_to_le16(res->sgid_tbl.hw_id [qp->ah.sgid_index]); + if (bmask & CMDQ_MODIFY_QP_MODIFY_MASK_SGID_INDEX) { + if (qp->type == CMDQ_CREATE_QP_TYPE_RAW_ETHERTYPE) + req.sgid_index = + cpu_to_le16(sgid_tbl->hw_id[qp->ugid_index]); + else + req.sgid_index = + cpu_to_le16(sgid_tbl->hw_id[qp->ah.sgid_index]); + } if (bmask & CMDQ_MODIFY_QP_MODIFY_MASK_HOP_LIMIT) req.hop_limit = qp->ah.hop_limit; @@ -1464,6 +1470,7 @@ int bnxt_qplib_query_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) qp->access = sb->access; qp->pkey_index = le16_to_cpu(sb->pkey); qp->qkey = le32_to_cpu(sb->qkey); + qp->udp_sport = le16_to_cpu(sb->udp_src_port); temp32[0] = le32_to_cpu(sb->dgid[0]); temp32[1] = le32_to_cpu(sb->dgid[1]); diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.h b/drivers/infiniband/hw/bnxt_re/qplib_fp.h index 4921a214c34c..b990d0c0ce1a 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.h @@ -299,6 +299,7 @@ struct bnxt_qplib_qp { u8 smac[6]; u16 vlan_id; u16 port_id; + u16 udp_sport; u8 nw_type; struct bnxt_qplib_ah ah; @@ -344,6 +345,7 @@ struct bnxt_qplib_qp { u32 msn_tbl_sz; bool is_host_msn_tbl; u8 tos_dscp; + u32 ugid_index; }; #define BNXT_RE_MAX_MSG_SIZE 0x80000000 diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c index 804bc773b4ef..295a9610f3e6 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c @@ -186,7 +186,7 @@ static int __wait_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie) * wait for command completion. Maximum holding interval is 8 second. * * Returns: - * -ETIMEOUT if command is not completed in specific time interval. + * -ETIMEDOUT if command is not completed in specific time interval. * 0 if command is completed by firmware. */ static int __block_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie) @@ -381,7 +381,7 @@ static int __send_message(struct bnxt_qplib_rcfw *rcfw, * This function can not be called from non-sleepable context. * * Returns: - * -ETIMEOUT if command is not completed in specific time interval. + * -ETIMEDOUT if command is not completed in specific time interval. * 0 if command is completed by firmware.
*/ static int __poll_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie) @@ -903,6 +903,10 @@ skip_ctx_setup: flags |= CMDQ_INITIALIZE_FW_FLAGS_OPTIMIZE_MODIFY_QP_SUPPORTED; if (rcfw->res->en_dev->flags & BNXT_EN_FLAG_ROCE_VF_RES_MGMT) flags |= CMDQ_INITIALIZE_FW_FLAGS_L2_VF_RESOURCE_MGMT; + if (bnxt_qplib_roce_mirror_supported(rcfw->res->cctx)) { + flags |= CMDQ_INITIALIZE_FW_FLAGS_MIRROR_ON_ROCE_SUPPORTED; + rcfw->roce_mirror = true; + } req.flags |= cpu_to_le16(flags); req.stat_ctx_id = cpu_to_le32(ctx->stats.fw_id); bnxt_qplib_fill_cmdqmsg(&msg, &req, &resp, NULL, sizeof(req), sizeof(resp), 0); diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h index ff873c5f1b25..988c89b4232e 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h @@ -236,6 +236,7 @@ struct bnxt_qplib_rcfw { atomic_t timeout_send; /* cached from chip cctx for quick reference in slow path */ u16 max_timeout; + bool roce_mirror; }; struct bnxt_qplib_cmdqmsg { diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.c b/drivers/infiniband/hw/bnxt_re/qplib_res.c index cc5c82d96839..875d7b52c06a 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_res.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_res.c @@ -53,12 +53,6 @@ #include "qplib_sp.h" #include "qplib_rcfw.h" -static void bnxt_qplib_free_stats_ctx(struct pci_dev *pdev, - struct bnxt_qplib_stats *stats); -static int bnxt_qplib_alloc_stats_ctx(struct pci_dev *pdev, - struct bnxt_qplib_chip_ctx *cctx, - struct bnxt_qplib_stats *stats); - /* PBL */ static void __free_pbl(struct bnxt_qplib_res *res, struct bnxt_qplib_pbl *pbl, bool is_umem) @@ -352,8 +346,8 @@ fail: } /* Context Tables */ -void bnxt_qplib_free_ctx(struct bnxt_qplib_res *res, - struct bnxt_qplib_ctx *ctx) +void bnxt_qplib_free_hwctx(struct bnxt_qplib_res *res, + struct bnxt_qplib_ctx *ctx) { int i; @@ -367,7 +361,6 @@ void bnxt_qplib_free_ctx(struct bnxt_qplib_res *res, /* restore original pde level before destroy */ ctx->tqm_ctx.pde.level = ctx->tqm_ctx.pde_level; bnxt_qplib_free_hwq(res, &ctx->tqm_ctx.pde); - bnxt_qplib_free_stats_ctx(res->pdev, &ctx->stats); } static int bnxt_qplib_alloc_tqm_rings(struct bnxt_qplib_res *res, @@ -466,7 +459,7 @@ fail: } /* - * Routine: bnxt_qplib_alloc_ctx + * Routine: bnxt_qplib_alloc_hwctx * Description: * Context tables are memories which are used by the chip fw.
* The 6 tables defined are: @@ -486,17 +479,13 @@ fail: * Returns: * 0 if success, else -ERRORS */ -int bnxt_qplib_alloc_ctx(struct bnxt_qplib_res *res, - struct bnxt_qplib_ctx *ctx, - bool virt_fn, bool is_p5) +int bnxt_qplib_alloc_hwctx(struct bnxt_qplib_res *res, + struct bnxt_qplib_ctx *ctx) { struct bnxt_qplib_hwq_attr hwq_attr = {}; struct bnxt_qplib_sg_info sginfo = {}; int rc; - if (virt_fn || is_p5) - goto stats_alloc; - /* QPC Tables */ sginfo.pgsize = PAGE_SIZE; sginfo.pgshft = PAGE_SHIFT; @@ -542,16 +531,11 @@ int bnxt_qplib_alloc_ctx(struct bnxt_qplib_res *res, rc = bnxt_qplib_alloc_init_hwq(&ctx->tim_tbl, &hwq_attr); if (rc) goto fail; -stats_alloc: - /* Stats */ - rc = bnxt_qplib_alloc_stats_ctx(res->pdev, res->cctx, &ctx->stats); - if (rc) - goto fail; return 0; fail: - bnxt_qplib_free_ctx(res, ctx); + bnxt_qplib_free_hwctx(res, ctx); return rc; } @@ -832,8 +816,8 @@ static int bnxt_qplib_alloc_dpi_tbl(struct bnxt_qplib_res *res, } /* Stats */ -static void bnxt_qplib_free_stats_ctx(struct pci_dev *pdev, - struct bnxt_qplib_stats *stats) +void bnxt_qplib_free_stats_ctx(struct pci_dev *pdev, + struct bnxt_qplib_stats *stats) { if (stats->dma) { dma_free_coherent(&pdev->dev, stats->size, @@ -843,9 +827,9 @@ static void bnxt_qplib_free_stats_ctx(struct pci_dev *pdev, stats->fw_id = -1; } -static int bnxt_qplib_alloc_stats_ctx(struct pci_dev *pdev, - struct bnxt_qplib_chip_ctx *cctx, - struct bnxt_qplib_stats *stats) +int bnxt_qplib_alloc_stats_ctx(struct pci_dev *pdev, + struct bnxt_qplib_chip_ctx *cctx, + struct bnxt_qplib_stats *stats) { memset(stats, 0, sizeof(*stats)); stats->fw_id = -1; diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.h b/drivers/infiniband/hw/bnxt_re/qplib_res.h index 6a13927674b4..2ea3b7f232a3 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_res.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_res.h @@ -65,6 +65,7 @@ struct bnxt_qplib_drv_modes { bool db_push; bool dbr_pacing; u32 toggle_bits; + u8 roce_mirror; }; enum bnxt_re_toggle_modes { @@ -303,6 +304,7 @@ struct bnxt_qplib_ctx { struct bnxt_qplib_hwq tim_tbl; struct bnxt_qplib_tqm_ctx tqm_ctx; struct bnxt_qplib_stats stats; + struct bnxt_qplib_stats stats3; struct bnxt_qplib_vf_res vf_res; }; @@ -432,15 +434,19 @@ void bnxt_qplib_cleanup_res(struct bnxt_qplib_res *res); int bnxt_qplib_init_res(struct bnxt_qplib_res *res); void bnxt_qplib_free_res(struct bnxt_qplib_res *res); int bnxt_qplib_alloc_res(struct bnxt_qplib_res *res, struct net_device *netdev); -void bnxt_qplib_free_ctx(struct bnxt_qplib_res *res, - struct bnxt_qplib_ctx *ctx); -int bnxt_qplib_alloc_ctx(struct bnxt_qplib_res *res, - struct bnxt_qplib_ctx *ctx, - bool virt_fn, bool is_p5); +void bnxt_qplib_free_hwctx(struct bnxt_qplib_res *res, + struct bnxt_qplib_ctx *ctx); +int bnxt_qplib_alloc_hwctx(struct bnxt_qplib_res *res, + struct bnxt_qplib_ctx *ctx); int bnxt_qplib_map_db_bar(struct bnxt_qplib_res *res); void bnxt_qplib_unmap_db_bar(struct bnxt_qplib_res *res); int bnxt_qplib_determine_atomics(struct pci_dev *dev); +int bnxt_qplib_alloc_stats_ctx(struct pci_dev *pdev, + struct bnxt_qplib_chip_ctx *cctx, + struct bnxt_qplib_stats *stats); +void bnxt_qplib_free_stats_ctx(struct pci_dev *pdev, + struct bnxt_qplib_stats *stats); static inline void bnxt_qplib_hwq_incr_prod(struct bnxt_qplib_db_info *dbinfo, struct bnxt_qplib_hwq *hwq, u32 cnt) @@ -582,6 +588,11 @@ static inline u8 bnxt_qplib_dbr_pacing_en(struct bnxt_qplib_chip_ctx *cctx) return cctx->modes.dbr_pacing; } +static inline u8 bnxt_qplib_roce_mirror_supported(struct 
bnxt_qplib_chip_ctx *cctx) +{ + return cctx->modes.roce_mirror; +} + static inline bool _is_alloc_mr_unified(u16 dev_cap_flags) { return dev_cap_flags & CREQ_QUERY_FUNC_RESP_SB_MR_REGISTER_ALLOC; diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.c b/drivers/infiniband/hw/bnxt_re/qplib_sp.c index 68981399598d..9ef581ed785c 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.c @@ -66,14 +66,15 @@ static bool bnxt_qplib_is_atomic_cap(struct bnxt_qplib_rcfw *rcfw) return (pcie_ctl2 & PCI_EXP_DEVCTL2_ATOMIC_REQ); } -static void bnxt_qplib_query_version(struct bnxt_qplib_rcfw *rcfw, - char *fw_ver) +void bnxt_qplib_query_version(struct bnxt_qplib_rcfw *rcfw) { struct creq_query_version_resp resp = {}; struct bnxt_qplib_cmdqmsg msg = {}; struct cmdq_query_version req = {}; + struct bnxt_qplib_dev_attr *attr; int rc; + attr = rcfw->res->dattr; bnxt_qplib_rcfw_cmd_prep((struct cmdq_base *)&req, CMDQ_BASE_OPCODE_QUERY_VERSION, sizeof(req)); @@ -82,10 +83,10 @@ static void bnxt_qplib_query_version(struct bnxt_qplib_rcfw *rcfw, rc = bnxt_qplib_rcfw_send_message(rcfw, &msg); if (rc) return; - fw_ver[0] = resp.fw_maj; - fw_ver[1] = resp.fw_minor; - fw_ver[2] = resp.fw_bld; - fw_ver[3] = resp.fw_rsvd; + attr->fw_ver[0] = resp.fw_maj; + attr->fw_ver[1] = resp.fw_minor; + attr->fw_ver[2] = resp.fw_bld; + attr->fw_ver[3] = resp.fw_rsvd; } int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw) @@ -179,8 +180,6 @@ int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw) if (_is_max_srq_ext_supported(attr->dev_cap_flags2)) attr->max_srq += le16_to_cpu(sb->max_srq_ext); - bnxt_qplib_query_version(rcfw, attr->fw_ver); - for (i = 0; i < MAX_TQM_ALLOC_REQ / 4; i++) { temp = le32_to_cpu(sb->tqm_alloc_reqs[i]); tqm_alloc = (u8 *)&temp; @@ -309,7 +308,8 @@ int bnxt_qplib_del_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl, int bnxt_qplib_add_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl, struct bnxt_qplib_gid *gid, const u8 *smac, - u16 vlan_id, bool update, u32 *index) + u16 vlan_id, bool update, u32 *index, + bool is_ugid, u32 stats_ctx_id) { struct bnxt_qplib_res *res = to_bnxt_qplib(sgid_tbl, struct bnxt_qplib_res, @@ -374,6 +374,9 @@ int bnxt_qplib_add_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl, req.src_mac[1] = cpu_to_be16(((u16 *)smac)[1]); req.src_mac[2] = cpu_to_be16(((u16 *)smac)[2]); + req.stats_ctx = cpu_to_le16(CMDQ_ADD_GID_STATS_CTX_STATS_CTX_VALID | + (u16)stats_ctx_id); + bnxt_qplib_fill_cmdqmsg(&msg, &req, &resp, NULL, sizeof(req), sizeof(resp), 0); rc = bnxt_qplib_rcfw_send_message(rcfw, &msg); @@ -397,46 +400,6 @@ int bnxt_qplib_add_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl, return 0; } -int bnxt_qplib_update_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl, - struct bnxt_qplib_gid *gid, u16 gid_idx, - const u8 *smac) -{ - struct bnxt_qplib_res *res = to_bnxt_qplib(sgid_tbl, - struct bnxt_qplib_res, - sgid_tbl); - struct bnxt_qplib_rcfw *rcfw = res->rcfw; - struct creq_modify_gid_resp resp = {}; - struct bnxt_qplib_cmdqmsg msg = {}; - struct cmdq_modify_gid req = {}; - int rc; - - bnxt_qplib_rcfw_cmd_prep((struct cmdq_base *)&req, - CMDQ_BASE_OPCODE_MODIFY_GID, - sizeof(req)); - - req.gid[0] = cpu_to_be32(((u32 *)gid->data)[3]); - req.gid[1] = cpu_to_be32(((u32 *)gid->data)[2]); - req.gid[2] = cpu_to_be32(((u32 *)gid->data)[1]); - req.gid[3] = cpu_to_be32(((u32 *)gid->data)[0]); - if (res->prio) { - req.vlan |= cpu_to_le16 - (CMDQ_ADD_GID_VLAN_TPID_TPID_8100 | - CMDQ_ADD_GID_VLAN_VLAN_EN); - } - - /* MAC in network format */ - req.src_mac[0] = cpu_to_be16(((u16 
*)smac)[0]); - req.src_mac[1] = cpu_to_be16(((u16 *)smac)[1]); - req.src_mac[2] = cpu_to_be16(((u16 *)smac)[2]); - - req.gid_index = cpu_to_le16(gid_idx); - - bnxt_qplib_fill_cmdqmsg(&msg, &req, &resp, NULL, sizeof(req), - sizeof(resp), 0); - rc = bnxt_qplib_rcfw_send_message(rcfw, &msg); - return rc; -} - /* AH */ int bnxt_qplib_create_ah(struct bnxt_qplib_res *res, struct bnxt_qplib_ah *ah, bool block) @@ -1143,3 +1106,40 @@ out: dma_free_coherent(&rcfw->pdev->dev, sbuf.size, sbuf.sb, sbuf.dma_addr); return rc; } + +int bnxt_qplib_create_flow(struct bnxt_qplib_res *res) +{ + struct creq_roce_mirror_cfg_resp resp = {}; + struct bnxt_qplib_rcfw *rcfw = res->rcfw; + struct cmdq_roce_mirror_cfg req = {}; + struct bnxt_qplib_cmdqmsg msg = {}; + + bnxt_qplib_rcfw_cmd_prep((struct cmdq_base *)&req, + CMDQ_BASE_OPCODE_ROCE_MIRROR_CFG, + sizeof(req)); + + req.mirror_flags = (u8)CMDQ_ROCE_MIRROR_CFG_MIRROR_ENABLE; + + bnxt_qplib_fill_cmdqmsg(&msg, &req, &resp, NULL, sizeof(req), + sizeof(resp), 0); + return bnxt_qplib_rcfw_send_message(rcfw, &msg); +} + +int bnxt_qplib_destroy_flow(struct bnxt_qplib_res *res) +{ + struct creq_roce_mirror_cfg_resp resp = {}; + struct bnxt_qplib_rcfw *rcfw = res->rcfw; + struct cmdq_roce_mirror_cfg req = {}; + struct bnxt_qplib_cmdqmsg msg = {}; + + bnxt_qplib_rcfw_cmd_prep((struct cmdq_base *)&req, + CMDQ_BASE_OPCODE_ROCE_MIRROR_CFG, + sizeof(req)); + + req.mirror_flags &= ~((u8)CMDQ_ROCE_MIRROR_CFG_MIRROR_ENABLE); + + bnxt_qplib_fill_cmdqmsg(&msg, &req, &resp, NULL, sizeof(req), + sizeof(resp), 0); + + return bnxt_qplib_rcfw_send_message(rcfw, &msg); +} diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.h b/drivers/infiniband/hw/bnxt_re/qplib_sp.h index 09faf4a1e849..147b5d9c0313 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.h @@ -323,7 +323,8 @@ int bnxt_qplib_del_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl, struct bnxt_qplib_gid *gid, u16 vlan_id, bool update); int bnxt_qplib_add_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl, struct bnxt_qplib_gid *gid, const u8 *mac, u16 vlan_id, - bool update, u32 *index); + bool update, u32 *index, + bool is_ugid, u32 stats_ctx_id); int bnxt_qplib_update_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl, struct bnxt_qplib_gid *gid, u16 gid_idx, const u8 *smac); @@ -358,6 +359,9 @@ int bnxt_qplib_read_context(struct bnxt_qplib_rcfw *rcfw, u8 type, u32 xid, u32 resp_size, void *resp_va); int bnxt_qplib_query_cc_param(struct bnxt_qplib_res *res, struct bnxt_qplib_cc_param *cc_param); +void bnxt_qplib_query_version(struct bnxt_qplib_rcfw *rcfw); +int bnxt_qplib_create_flow(struct bnxt_qplib_res *res); +int bnxt_qplib_destroy_flow(struct bnxt_qplib_res *res); #define BNXT_VAR_MAX_WQE 4352 #define BNXT_VAR_MAX_SLOT_ALIGN 256 diff --git a/drivers/infiniband/hw/bnxt_re/roce_hsi.h b/drivers/infiniband/hw/bnxt_re/roce_hsi.h index 024845f945ff..99ecd72e72e2 100644 --- a/drivers/infiniband/hw/bnxt_re/roce_hsi.h +++ b/drivers/infiniband/hw/bnxt_re/roce_hsi.h @@ -144,7 +144,8 @@ struct cmdq_base { #define CMDQ_BASE_OPCODE_MODIFY_CQ 0x90UL #define CMDQ_BASE_OPCODE_QUERY_QP_EXTEND 0x91UL #define CMDQ_BASE_OPCODE_QUERY_ROCE_STATS_EXT 0x92UL - #define CMDQ_BASE_OPCODE_LAST CMDQ_BASE_OPCODE_QUERY_ROCE_STATS_EXT + #define CMDQ_BASE_OPCODE_ROCE_MIRROR_CFG 0x99UL + #define CMDQ_BASE_OPCODE_LAST CMDQ_BASE_OPCODE_ROCE_MIRROR_CFG u8 cmd_size; __le16 flags; __le16 cookie; @@ -218,6 +219,7 @@ struct cmdq_initialize_fw { #define CMDQ_INITIALIZE_FW_FLAGS_HW_REQUESTER_RETX_SUPPORTED 0x2UL #define 
CMDQ_INITIALIZE_FW_FLAGS_OPTIMIZE_MODIFY_QP_SUPPORTED 0x8UL #define CMDQ_INITIALIZE_FW_FLAGS_L2_VF_RESOURCE_MGMT 0x10UL + #define CMDQ_INITIALIZE_FW_FLAGS_MIRROR_ON_ROCE_SUPPORTED 0x80UL __le16 cookie; u8 resp_size; u8 reserved8; @@ -788,7 +790,8 @@ struct creq_query_qp_resp_sb { #define CREQ_QUERY_QP_RESP_SB_ACCESS_REMOTE_ATOMIC 0x8UL __le16 pkey; __le32 qkey; - __le32 reserved32; + __le16 udp_src_port; + __le16 reserved16; __le32 dgid[4]; __le32 flow_label; __le16 sgid_index; @@ -2108,6 +2111,43 @@ struct creq_query_roce_stats_ext_resp_sb { __le64 dup_req; }; +/* cmdq_roce_mirror_cfg (size:192b/24B) */ +struct cmdq_roce_mirror_cfg { + u8 opcode; + #define CMDQ_ROCE_MIRROR_CFG_OPCODE_ROCE_MIRROR_CFG 0x99UL + #define CMDQ_ROCE_MIRROR_CFG_OPCODE_LAST \ + CMDQ_ROCE_MIRROR_CFG_OPCODE_ROCE_MIRROR_CFG + u8 cmd_size; + __le16 flags; + __le16 cookie; + u8 resp_size; + u8 reserved8; + __le64 resp_addr; + u8 mirror_flags; + #define CMDQ_ROCE_MIRROR_CFG_MIRROR_ENABLE 0x1UL + u8 rsvd[7]; +}; + +/* creq_roce_mirror_cfg_resp (size:128b/16B) */ +struct creq_roce_mirror_cfg_resp { + u8 type; + #define CREQ_ROCE_MIRROR_CFG_RESP_TYPE_MASK 0x3fUL + #define CREQ_ROCE_MIRROR_CFG_RESP_TYPE_SFT 0 + #define CREQ_ROCE_MIRROR_CFG_RESP_TYPE_QP_EVENT 0x38UL + #define CREQ_ROCE_MIRROR_CFG_RESP_TYPE_LAST \ + CREQ_ROCE_MIRROR_CFG_RESP_TYPE_QP_EVENT + u8 status; + __le16 cookie; + __le32 reserved32; + u8 v; + #define CREQ_ROCE_MIRROR_CFG_RESP_V 0x1UL + u8 event; + #define CREQ_ROCE_MIRROR_CFG_RESP_EVENT_ROCE_MIRROR_CFG 0x99UL + #define CREQ_ROCE_MIRROR_CFG_RESP_EVENT_LAST \ + CREQ_ROCE_MIRROR_CFG_RESP_EVENT_ROCE_MIRROR_CFG + u8 reserved48[6]; +}; + /* cmdq_query_func (size:128b/16B) */ struct cmdq_query_func { u8 opcode; diff --git a/drivers/infiniband/hw/cxgb4/device.c b/drivers/infiniband/hw/cxgb4/device.c index b67747ae6a68..d892f55febe2 100644 --- a/drivers/infiniband/hw/cxgb4/device.c +++ b/drivers/infiniband/hw/cxgb4/device.c @@ -1228,9 +1228,8 @@ static int c4iw_uld_state_change(void *handle, enum cxgb4_state new_state) if (!ctx->dev) { ctx->dev = c4iw_alloc(&ctx->lldi); if (IS_ERR(ctx->dev)) { - pr_err("%s: initialization failed: %ld\n", - pci_name(ctx->lldi.pdev), - PTR_ERR(ctx->dev)); + pr_err("%s: initialization failed: %pe\n", + pci_name(ctx->lldi.pdev), ctx->dev); ctx->dev = NULL; break; } diff --git a/drivers/infiniband/hw/efa/efa_com.c b/drivers/infiniband/hw/efa/efa_com.c index bafd210dd43e..0e979ca10d24 100644 --- a/drivers/infiniband/hw/efa/efa_com.c +++ b/drivers/infiniband/hw/efa/efa_com.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause /* - * Copyright 2018-2024 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2018-2025 Amazon.com, Inc. or its affiliates. All rights reserved. 
*/ #include "efa_com.h" @@ -30,6 +30,7 @@ struct efa_comp_ctx { struct efa_admin_acq_entry *user_cqe; u32 comp_size; enum efa_cmd_status status; + u16 cmd_id; u8 cmd_opcode; u8 occupied; }; @@ -333,6 +334,7 @@ static struct efa_comp_ctx *__efa_com_submit_admin_cmd(struct efa_com_admin_queu comp_ctx->comp_size = comp_size_in_bytes; comp_ctx->user_cqe = comp; comp_ctx->cmd_opcode = cmd->aq_common_descriptor.opcode; + comp_ctx->cmd_id = cmd_id; reinit_completion(&comp_ctx->wait_event); @@ -557,17 +559,19 @@ static int efa_com_wait_and_process_admin_cq_interrupts(struct efa_comp_ctx *com if (comp_ctx->status == EFA_CMD_COMPLETED) ibdev_err_ratelimited( aq->efa_dev, - "The device sent a completion but the driver didn't receive any MSI-X interrupt for admin cmd %s(%d) status %d (ctx: 0x%p, sq producer: %d, sq consumer: %d, cq consumer: %d)\n", + "The device sent a completion but the driver didn't receive any MSI-X interrupt for admin cmd %s(%d) status %d (id: %d, sq producer: %d, sq consumer: %d, cq consumer: %d)\n", efa_com_cmd_str(comp_ctx->cmd_opcode), comp_ctx->cmd_opcode, comp_ctx->status, - comp_ctx, aq->sq.pc, aq->sq.cc, aq->cq.cc); + comp_ctx->cmd_id, aq->sq.pc, aq->sq.cc, + aq->cq.cc); else ibdev_err_ratelimited( aq->efa_dev, - "The device didn't send any completion for admin cmd %s(%d) status %d (ctx 0x%p, sq producer: %d, sq consumer: %d, cq consumer: %d)\n", + "The device didn't send any completion for admin cmd %s(%d) status %d (id: %d, sq producer: %d, sq consumer: %d, cq consumer: %d)\n", efa_com_cmd_str(comp_ctx->cmd_opcode), comp_ctx->cmd_opcode, comp_ctx->status, - comp_ctx, aq->sq.pc, aq->sq.cc, aq->cq.cc); + comp_ctx->cmd_id, aq->sq.pc, aq->sq.cc, + aq->cq.cc); clear_bit(EFA_AQ_STATE_RUNNING_BIT, &aq->state); err = -ETIME; @@ -631,9 +635,9 @@ int efa_com_cmd_exec(struct efa_com_admin_queue *aq, if (IS_ERR(comp_ctx)) { ibdev_err_ratelimited( aq->efa_dev, - "Failed to submit command %s (opcode %u) err %ld\n", + "Failed to submit command %s (opcode %u) err %pe\n", efa_com_cmd_str(cmd->aq_common_descriptor.opcode), - cmd->aq_common_descriptor.opcode, PTR_ERR(comp_ctx)); + cmd->aq_common_descriptor.opcode, comp_ctx); up(&aq->avail_cmds); atomic64_inc(&aq->stats.cmd_err); diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c index 886923d5fe50..d9a12681f843 100644 --- a/drivers/infiniband/hw/efa/efa_verbs.c +++ b/drivers/infiniband/hw/efa/efa_verbs.c @@ -1788,7 +1788,8 @@ struct ib_mr *efa_reg_user_mr_dmabuf(struct ib_pd *ibpd, u64 start, access_flags); if (IS_ERR(umem_dmabuf)) { err = PTR_ERR(umem_dmabuf); - ibdev_dbg(&dev->ibdev, "Failed to get dmabuf umem[%d]\n", err); + ibdev_dbg(&dev->ibdev, "Failed to get dmabuf umem[%pe]\n", + umem_dmabuf); goto err_free; } @@ -1832,7 +1833,8 @@ struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length, if (IS_ERR(mr->umem)) { err = PTR_ERR(mr->umem); ibdev_dbg(&dev->ibdev, - "Failed to pin and map user space memory[%d]\n", err); + "Failed to pin and map user space memory[%pe]\n", + mr->umem); goto err_free; } diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.c b/drivers/infiniband/hw/erdma/erdma_verbs.c index fdeec33c71da..109a3f3de911 100644 --- a/drivers/infiniband/hw/erdma/erdma_verbs.c +++ b/drivers/infiniband/hw/erdma/erdma_verbs.c @@ -149,7 +149,7 @@ static int regmr_cmd(struct erdma_dev *dev, struct erdma_mr *mr) req.phy_addr[0] = mr->mem.mtt->buf_dma; mtt_level = ERDMA_MR_MTT_1LEVEL; } else { - req.phy_addr[0] = sg_dma_address(mr->mem.mtt->sglist); + req.phy_addr[0] = 
mr->mem.mtt->dma_addrs[0]; mtt_level = mr->mem.mtt->level; } } else if (mr->type != ERDMA_MR_TYPE_DMA) { @@ -626,18 +626,27 @@ err_free_mtt: return ERR_PTR(-ENOMEM); } -static void erdma_destroy_mtt_buf_sg(struct erdma_dev *dev, - struct erdma_mtt *mtt) +static void erdma_unmap_page_list(struct erdma_dev *dev, dma_addr_t *pg_dma, + u32 npages) { - dma_unmap_sg(&dev->pdev->dev, mtt->sglist, - DIV_ROUND_UP(mtt->size, PAGE_SIZE), DMA_TO_DEVICE); - vfree(mtt->sglist); + u32 i; + + for (i = 0; i < npages; i++) + dma_unmap_page(&dev->pdev->dev, pg_dma[i], PAGE_SIZE, + DMA_TO_DEVICE); +} + +static void erdma_destroy_mtt_buf_dma_addrs(struct erdma_dev *dev, + struct erdma_mtt *mtt) +{ + erdma_unmap_page_list(dev, mtt->dma_addrs, mtt->npages); + vfree(mtt->dma_addrs); } static void erdma_destroy_scatter_mtt(struct erdma_dev *dev, struct erdma_mtt *mtt) { - erdma_destroy_mtt_buf_sg(dev, mtt); + erdma_destroy_mtt_buf_dma_addrs(dev, mtt); vfree(mtt->buf); kfree(mtt); } @@ -645,50 +654,69 @@ static void erdma_destroy_scatter_mtt(struct erdma_dev *dev, static void erdma_init_middle_mtt(struct erdma_mtt *mtt, struct erdma_mtt *low_mtt) { - struct scatterlist *sg; - u32 idx = 0, i; + dma_addr_t *pg_addr = mtt->buf; + u32 i; - for_each_sg(low_mtt->sglist, sg, low_mtt->nsg, i) - mtt->buf[idx++] = sg_dma_address(sg); + for (i = 0; i < low_mtt->npages; i++) + pg_addr[i] = low_mtt->dma_addrs[i]; } -static int erdma_create_mtt_buf_sg(struct erdma_dev *dev, struct erdma_mtt *mtt) +static u32 vmalloc_to_dma_addrs(struct erdma_dev *dev, dma_addr_t **dma_addrs, + void *buf, u64 len) { - struct scatterlist *sglist; - void *buf = mtt->buf; - u32 npages, i, nsg; + dma_addr_t *pg_dma; struct page *pg; + u32 npages, i; + void *addr; - /* Failed if buf is not page aligned */ - if ((uintptr_t)buf & ~PAGE_MASK) - return -EINVAL; - - npages = DIV_ROUND_UP(mtt->size, PAGE_SIZE); - sglist = vzalloc(npages * sizeof(*sglist)); - if (!sglist) - return -ENOMEM; + npages = (PAGE_ALIGN((u64)buf + len) - PAGE_ALIGN_DOWN((u64)buf)) >> + PAGE_SHIFT; + pg_dma = vcalloc(npages, sizeof(*pg_dma)); + if (!pg_dma) + return 0; - sg_init_table(sglist, npages); + addr = buf; for (i = 0; i < npages; i++) { - pg = vmalloc_to_page(buf); + pg = vmalloc_to_page(addr); if (!pg) goto err; - sg_set_page(&sglist[i], pg, PAGE_SIZE, 0); - buf += PAGE_SIZE; + + pg_dma[i] = dma_map_page(&dev->pdev->dev, pg, 0, PAGE_SIZE, + DMA_TO_DEVICE); + if (dma_mapping_error(&dev->pdev->dev, pg_dma[i])) + goto err; + + addr += PAGE_SIZE; } - nsg = dma_map_sg(&dev->pdev->dev, sglist, npages, DMA_TO_DEVICE); - if (!nsg) - goto err; + *dma_addrs = pg_dma; - mtt->sglist = sglist; - mtt->nsg = nsg; + return npages; +err: + erdma_unmap_page_list(dev, pg_dma, i); + vfree(pg_dma); return 0; -err: - vfree(sglist); +} - return -ENOMEM; +static int erdma_create_mtt_buf_dma_addrs(struct erdma_dev *dev, + struct erdma_mtt *mtt) +{ + dma_addr_t *addrs; + u32 npages; + + /* Failed if buf is not page aligned */ + if ((uintptr_t)mtt->buf & ~PAGE_MASK) + return -EINVAL; + + npages = vmalloc_to_dma_addrs(dev, &addrs, mtt->buf, mtt->size); + if (!npages) + return -ENOMEM; + + mtt->dma_addrs = addrs; + mtt->npages = npages; + + return 0; } static struct erdma_mtt *erdma_create_scatter_mtt(struct erdma_dev *dev, @@ -707,12 +735,12 @@ static struct erdma_mtt *erdma_create_scatter_mtt(struct erdma_dev *dev, if (!mtt->buf) goto err_free_mtt; - ret = erdma_create_mtt_buf_sg(dev, mtt); + ret = erdma_create_mtt_buf_dma_addrs(dev, mtt); if (ret) goto err_free_mtt_buf; - ibdev_dbg(&dev->ibdev, 
"create scatter mtt, size:%lu, nsg:%u\n", - mtt->size, mtt->nsg); + ibdev_dbg(&dev->ibdev, "create scatter mtt, size:%lu, npages:%u\n", + mtt->size, mtt->npages); return mtt; @@ -746,8 +774,8 @@ static struct erdma_mtt *erdma_create_mtt(struct erdma_dev *dev, size_t size, level = 1; /* convergence the mtt table. */ - while (mtt->nsg != 1 && level <= 3) { - tmp_mtt = erdma_create_scatter_mtt(dev, MTT_SIZE(mtt->nsg)); + while (mtt->npages != 1 && level <= 3) { + tmp_mtt = erdma_create_scatter_mtt(dev, MTT_SIZE(mtt->npages)); if (IS_ERR(tmp_mtt)) { ret = PTR_ERR(tmp_mtt); goto err_free_mtt; @@ -765,7 +793,7 @@ static struct erdma_mtt *erdma_create_mtt(struct erdma_dev *dev, size_t size, mtt->level = level; ibdev_dbg(&dev->ibdev, "top mtt: level:%d, dma_addr 0x%llx\n", - mtt->level, mtt->sglist[0].dma_address); + mtt->level, mtt->dma_addrs[0]); return mtt; err_free_mtt: diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.h b/drivers/infiniband/hw/erdma/erdma_verbs.h index ef411b81fbd7..7d8d3fe501d5 100644 --- a/drivers/infiniband/hw/erdma/erdma_verbs.h +++ b/drivers/infiniband/hw/erdma/erdma_verbs.h @@ -99,8 +99,8 @@ struct erdma_mtt { union { dma_addr_t buf_dma; struct { - struct scatterlist *sglist; - u32 nsg; + dma_addr_t *dma_addrs; + u32 npages; u32 level; }; }; diff --git a/drivers/infiniband/hw/hfi1/device.c b/drivers/infiniband/hw/hfi1/device.c index 4250d077b06f..a98a4175e53b 100644 --- a/drivers/infiniband/hw/hfi1/device.c +++ b/drivers/infiniband/hw/hfi1/device.c @@ -64,9 +64,9 @@ int hfi1_cdev_init(int minor, const char *name, if (IS_ERR(device)) { ret = PTR_ERR(device); + pr_err("Could not create device for minor %d, %s (err %pe)\n", + minor, name, device); device = NULL; - pr_err("Could not create device for minor %d, %s (err %d)\n", - minor, name, -ret); cdev_del(cdev); } done: diff --git a/drivers/infiniband/hw/hfi1/sdma.c b/drivers/infiniband/hw/hfi1/sdma.c index 719b7c34e238..5cfa4f8fbf3d 100644 --- a/drivers/infiniband/hw/hfi1/sdma.c +++ b/drivers/infiniband/hw/hfi1/sdma.c @@ -990,7 +990,7 @@ ssize_t sdma_set_cpu_to_sde_map(struct sdma_engine *sde, const char *buf, } /* Clean up old mappings */ - for_each_cpu(cpu, cpu_online_mask) { + for_each_online_cpu(cpu) { struct sdma_rht_node *rht_node; /* Don't cleanup sdes that are set in the new mask */ diff --git a/drivers/infiniband/hw/hfi1/user_sdma.c b/drivers/infiniband/hw/hfi1/user_sdma.c index b72625283fcf..9b1aece1b080 100644 --- a/drivers/infiniband/hw/hfi1/user_sdma.c +++ b/drivers/infiniband/hw/hfi1/user_sdma.c @@ -498,8 +498,8 @@ int hfi1_user_sdma_process_request(struct hfi1_filedata *fd, ntids, sizeof(*req->tids)); if (IS_ERR(tmp)) { ret = PTR_ERR(tmp); - SDMA_DBG(req, "Failed to copy %d TIDs (%d)", - ntids, ret); + SDMA_DBG(req, "Failed to copy %d TIDs (%pe)", ntids, + tmp); goto free_req; } req->tids = tmp; diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c index 0f037e545520..31cb8699e198 100644 --- a/drivers/infiniband/hw/hns/hns_roce_mr.c +++ b/drivers/infiniband/hw/hns/hns_roce_mr.c @@ -594,8 +594,8 @@ static int mtr_alloc_bufs(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr, mtr->umem = ib_umem_get(ibdev, user_addr, total_size, buf_attr->user_access); if (IS_ERR(mtr->umem)) { - ibdev_err(ibdev, "failed to get umem, ret = %ld.\n", - PTR_ERR(mtr->umem)); + ibdev_err(ibdev, "failed to get umem, ret = %pe.\n", + mtr->umem); return -ENOMEM; } } else { @@ -605,8 +605,8 @@ static int mtr_alloc_bufs(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr, 
!mtr_has_mtt(buf_attr) ? HNS_ROCE_BUF_DIRECT : 0); if (IS_ERR(mtr->kmem)) { - ibdev_err(ibdev, "failed to alloc kmem, ret = %ld.\n", - PTR_ERR(mtr->kmem)); + ibdev_err(ibdev, "failed to alloc kmem, ret = %pe.\n", + mtr->kmem); return PTR_ERR(mtr->kmem); } } diff --git a/drivers/infiniband/hw/ionic/Kconfig b/drivers/infiniband/hw/ionic/Kconfig new file mode 100644 index 000000000000..de6f10e9b6e9 --- /dev/null +++ b/drivers/infiniband/hw/ionic/Kconfig @@ -0,0 +1,15 @@ +# SPDX-License-Identifier: GPL-2.0 +# Copyright (C) 2018-2025, Advanced Micro Devices, Inc. + +config INFINIBAND_IONIC + tristate "AMD Pensando DSC RDMA/RoCE Support" + depends on NETDEVICES && ETHERNET && PCI && INET && IONIC + help + This enables RDMA/RoCE support for the AMD Pensando family of + Distributed Services Cards (DSCs). + + To learn more, visit our website at + <https://www.amd.com/en/products/accelerators/pensando.html>. + + To compile this driver as a module, choose M here. The module + will be called ionic_rdma. diff --git a/drivers/infiniband/hw/ionic/Makefile b/drivers/infiniband/hw/ionic/Makefile new file mode 100644 index 000000000000..957973742820 --- /dev/null +++ b/drivers/infiniband/hw/ionic/Makefile @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: GPL-2.0 + +ccflags-y := -I $(srctree)/drivers/net/ethernet/pensando/ionic + +obj-$(CONFIG_INFINIBAND_IONIC) += ionic_rdma.o + +ionic_rdma-y := \ + ionic_ibdev.o ionic_lif_cfg.o ionic_queue.o ionic_pgtbl.o ionic_admin.o \ + ionic_controlpath.o ionic_datapath.o ionic_hw_stats.o diff --git a/drivers/infiniband/hw/ionic/ionic_admin.c b/drivers/infiniband/hw/ionic/ionic_admin.c new file mode 100644 index 000000000000..2537aa55d12d --- /dev/null +++ b/drivers/infiniband/hw/ionic/ionic_admin.c @@ -0,0 +1,1229 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2025, Advanced Micro Devices, Inc. 
*/ + +#include <linux/interrupt.h> +#include <linux/module.h> +#include <linux/printk.h> + +#include "ionic_fw.h" +#include "ionic_ibdev.h" + +#define IONIC_EQ_COUNT_MIN 4 +#define IONIC_AQ_COUNT_MIN 1 + +/* neither a valid queue position nor a negative error status */ +#define IONIC_ADMIN_POSTED 0x10000 + +/* cpu can be held with irq disabled for COUNT * MS (for create/destroy_ah) */ +#define IONIC_ADMIN_BUSY_RETRY_COUNT 2000 +#define IONIC_ADMIN_BUSY_RETRY_MS 1 + +/* admin queue will be considered failed if a command takes longer */ +#define IONIC_ADMIN_TIMEOUT (HZ * 2) +#define IONIC_ADMIN_WARN (HZ / 8) + +/* poll the admin cq at this interval to tolerate and report a missed event */ +#define IONIC_ADMIN_DELAY (HZ / 8) + +/* work queue for polling the event queue and admin cq */ +struct workqueue_struct *ionic_evt_workq; + +static void ionic_admin_timedout(struct ionic_aq *aq) +{ + struct ionic_ibdev *dev = aq->dev; + unsigned long irqflags; + u16 pos; + + spin_lock_irqsave(&aq->lock, irqflags); + if (ionic_queue_empty(&aq->q)) + goto out; + + /* Reset ALL adminq if any one times out */ + if (atomic_read(&aq->admin_state) < IONIC_ADMIN_KILLED) + queue_work(ionic_evt_workq, &dev->reset_work); + + ibdev_err(&dev->ibdev, "admin command timed out, aq %d after: %ums\n", + aq->aqid, (u32)jiffies_to_msecs(jiffies - aq->stamp)); + + pos = (aq->q.prod - 1) & aq->q.mask; + if (pos == aq->q.cons) + goto out; + + ibdev_warn(&dev->ibdev, "admin pos %u (last posted)\n", pos); + print_hex_dump(KERN_WARNING, "cmd ", DUMP_PREFIX_OFFSET, 16, 1, + ionic_queue_at(&aq->q, pos), + BIT(aq->q.stride_log2), true); + +out: + spin_unlock_irqrestore(&aq->lock, irqflags); +} + +static void ionic_admin_reset_dwork(struct ionic_ibdev *dev) +{ + if (atomic_read(&dev->admin_state) == IONIC_ADMIN_KILLED) + return; + + queue_delayed_work(ionic_evt_workq, &dev->admin_dwork, + IONIC_ADMIN_DELAY); +} + +static void ionic_admin_reset_wdog(struct ionic_aq *aq) +{ + if (atomic_read(&aq->admin_state) == IONIC_ADMIN_KILLED) + return; + + aq->stamp = jiffies; + ionic_admin_reset_dwork(aq->dev); +} + +static bool ionic_admin_next_cqe(struct ionic_ibdev *dev, struct ionic_cq *cq, + struct ionic_v1_cqe **cqe) +{ + struct ionic_v1_cqe *qcqe = ionic_queue_at_prod(&cq->q); + + if (unlikely(cq->color != ionic_v1_cqe_color(qcqe))) + return false; + + /* Prevent out-of-order reads of the CQE */ + dma_rmb(); + *cqe = qcqe; + + return true; +} + +static void ionic_admin_poll_locked(struct ionic_aq *aq) +{ + struct ionic_cq *cq = &aq->vcq->cq[0]; + struct ionic_admin_wr *wr, *wr_next; + struct ionic_ibdev *dev = aq->dev; + u32 wr_strides, avlbl_strides; + struct ionic_v1_cqe *cqe; + u32 qtf, qid; + u16 old_prod; + u8 type; + + lockdep_assert_held(&aq->lock); + + if (atomic_read(&aq->admin_state) == IONIC_ADMIN_KILLED) { + list_for_each_entry_safe(wr, wr_next, &aq->wr_prod, aq_ent) { + INIT_LIST_HEAD(&wr->aq_ent); + aq->q_wr[wr->status].wr = NULL; + wr->status = atomic_read(&aq->admin_state); + complete_all(&wr->work); + } + INIT_LIST_HEAD(&aq->wr_prod); + + list_for_each_entry_safe(wr, wr_next, &aq->wr_post, aq_ent) { + INIT_LIST_HEAD(&wr->aq_ent); + wr->status = atomic_read(&aq->admin_state); + complete_all(&wr->work); + } + INIT_LIST_HEAD(&aq->wr_post); + + return; + } + + old_prod = cq->q.prod; + + while (ionic_admin_next_cqe(dev, cq, &cqe)) { + qtf = ionic_v1_cqe_qtf(cqe); + qid = ionic_v1_cqe_qtf_qid(qtf); + type = ionic_v1_cqe_qtf_type(qtf); + + if (unlikely(type != IONIC_V1_CQE_TYPE_ADMIN)) { + ibdev_warn_ratelimited(&dev->ibdev, + "bad cqe type
%u\n", type); + goto cq_next; + } + + if (unlikely(qid != aq->aqid)) { + ibdev_warn_ratelimited(&dev->ibdev, + "bad cqe qid %u\n", qid); + goto cq_next; + } + + if (unlikely(be16_to_cpu(cqe->admin.cmd_idx) != aq->q.cons)) { + ibdev_warn_ratelimited(&dev->ibdev, + "bad idx %u cons %u qid %u\n", + be16_to_cpu(cqe->admin.cmd_idx), + aq->q.cons, qid); + goto cq_next; + } + + if (unlikely(ionic_queue_empty(&aq->q))) { + ibdev_warn_ratelimited(&dev->ibdev, + "bad cqe for empty adminq\n"); + goto cq_next; + } + + wr = aq->q_wr[aq->q.cons].wr; + if (wr) { + aq->q_wr[aq->q.cons].wr = NULL; + list_del_init(&wr->aq_ent); + + wr->cqe = *cqe; + wr->status = atomic_read(&aq->admin_state); + complete_all(&wr->work); + } + + ionic_queue_consume_entries(&aq->q, + aq->q_wr[aq->q.cons].wqe_strides); + +cq_next: + ionic_queue_produce(&cq->q); + cq->color = ionic_color_wrap(cq->q.prod, cq->color); + } + + if (old_prod != cq->q.prod) { + ionic_admin_reset_wdog(aq); + cq->q.cons = cq->q.prod; + ionic_dbell_ring(dev->lif_cfg.dbpage, dev->lif_cfg.cq_qtype, + ionic_queue_dbell_val(&cq->q)); + queue_work(ionic_evt_workq, &aq->work); + } else if (!aq->armed) { + aq->armed = true; + cq->arm_any_prod = ionic_queue_next(&cq->q, cq->arm_any_prod); + ionic_dbell_ring(dev->lif_cfg.dbpage, dev->lif_cfg.cq_qtype, + cq->q.dbell | IONIC_CQ_RING_ARM | + cq->arm_any_prod); + queue_work(ionic_evt_workq, &aq->work); + } + + if (atomic_read(&aq->admin_state) != IONIC_ADMIN_ACTIVE) + return; + + old_prod = aq->q.prod; + + if (ionic_queue_empty(&aq->q) && !list_empty(&aq->wr_post)) + ionic_admin_reset_wdog(aq); + + if (list_empty(&aq->wr_post)) + return; + + do { + u8 *src; + int i, src_len; + size_t stride_len; + + wr = list_first_entry(&aq->wr_post, struct ionic_admin_wr, + aq_ent); + wr_strides = (le16_to_cpu(wr->wqe.len) + ADMIN_WQE_HDR_LEN + + (ADMIN_WQE_STRIDE - 1)) >> aq->q.stride_log2; + avlbl_strides = ionic_queue_length_remaining(&aq->q); + + if (wr_strides > avlbl_strides) + break; + + list_move(&wr->aq_ent, &aq->wr_prod); + wr->status = aq->q.prod; + aq->q_wr[aq->q.prod].wr = wr; + aq->q_wr[aq->q.prod].wqe_strides = wr_strides; + + src_len = le16_to_cpu(wr->wqe.len); + src = (uint8_t *)&wr->wqe.cmd; + + /* First stride */ + memcpy(ionic_queue_at_prod(&aq->q), &wr->wqe, + ADMIN_WQE_HDR_LEN); + stride_len = ADMIN_WQE_STRIDE - ADMIN_WQE_HDR_LEN; + if (stride_len > src_len) + stride_len = src_len; + memcpy(ionic_queue_at_prod(&aq->q) + ADMIN_WQE_HDR_LEN, + src, stride_len); + ibdev_dbg(&dev->ibdev, "post admin prod %u (%u strides)\n", + aq->q.prod, wr_strides); + print_hex_dump_debug("wqe ", DUMP_PREFIX_OFFSET, 16, 1, + ionic_queue_at_prod(&aq->q), + BIT(aq->q.stride_log2), true); + ionic_queue_produce(&aq->q); + + /* Remaining strides */ + for (i = stride_len; i < src_len; i += stride_len) { + stride_len = ADMIN_WQE_STRIDE; + + if (i + stride_len > src_len) + stride_len = src_len - i; + + memcpy(ionic_queue_at_prod(&aq->q), src + i, + stride_len); + print_hex_dump_debug("wqe ", DUMP_PREFIX_OFFSET, 16, 1, + ionic_queue_at_prod(&aq->q), + BIT(aq->q.stride_log2), true); + ionic_queue_produce(&aq->q); + } + } while (!list_empty(&aq->wr_post)); + + if (old_prod != aq->q.prod) + ionic_dbell_ring(dev->lif_cfg.dbpage, dev->lif_cfg.aq_qtype, + ionic_queue_dbell_val(&aq->q)); +} + +static void ionic_admin_dwork(struct work_struct *ws) +{ + struct ionic_ibdev *dev = + container_of(ws, struct ionic_ibdev, admin_dwork.work); + struct ionic_aq *aq, *bad_aq = NULL; + bool do_reschedule = false; + unsigned long irqflags; + bool do_reset = 
false; + u16 pos; + int i; + + for (i = 0; i < dev->lif_cfg.aq_count; i++) { + aq = dev->aq_vec[i]; + + spin_lock_irqsave(&aq->lock, irqflags); + + if (ionic_queue_empty(&aq->q)) + goto next_aq; + + /* Reschedule if any queue has outstanding work */ + do_reschedule = true; + + if (time_is_after_eq_jiffies(aq->stamp + IONIC_ADMIN_WARN)) + /* Warning threshold not met, nothing to do */ + goto next_aq; + + /* See if polling now makes some progress */ + pos = aq->q.cons; + ionic_admin_poll_locked(aq); + if (pos != aq->q.cons) { + ibdev_dbg(&dev->ibdev, + "missed event for acq %d\n", aq->cqid); + goto next_aq; + } + + if (time_is_after_eq_jiffies(aq->stamp + + IONIC_ADMIN_TIMEOUT)) { + /* Timeout threshold not met */ + ibdev_dbg(&dev->ibdev, "no progress after %ums\n", + (u32)jiffies_to_msecs(jiffies - aq->stamp)); + goto next_aq; + } + + /* Queue timed out */ + bad_aq = aq; + do_reset = true; +next_aq: + spin_unlock_irqrestore(&aq->lock, irqflags); + } + + if (do_reset) + /* Reset RDMA lif on a timeout */ + ionic_admin_timedout(bad_aq); + else if (do_reschedule) + /* Try to poll again later */ + ionic_admin_reset_dwork(dev); +} + +static void ionic_admin_work(struct work_struct *ws) +{ + struct ionic_aq *aq = container_of(ws, struct ionic_aq, work); + unsigned long irqflags; + + spin_lock_irqsave(&aq->lock, irqflags); + ionic_admin_poll_locked(aq); + spin_unlock_irqrestore(&aq->lock, irqflags); +} + +static void ionic_admin_post_aq(struct ionic_aq *aq, struct ionic_admin_wr *wr) +{ + unsigned long irqflags; + bool poll; + + wr->status = IONIC_ADMIN_POSTED; + wr->aq = aq; + + spin_lock_irqsave(&aq->lock, irqflags); + poll = list_empty(&aq->wr_post); + list_add(&wr->aq_ent, &aq->wr_post); + if (poll) + ionic_admin_poll_locked(aq); + spin_unlock_irqrestore(&aq->lock, irqflags); +} + +void ionic_admin_post(struct ionic_ibdev *dev, struct ionic_admin_wr *wr) +{ + int aq_idx; + + /* Use cpu id for the adminq selection */ + aq_idx = raw_smp_processor_id() % dev->lif_cfg.aq_count; + ionic_admin_post_aq(dev->aq_vec[aq_idx], wr); +} + +static void ionic_admin_cancel(struct ionic_admin_wr *wr) +{ + struct ionic_aq *aq = wr->aq; + unsigned long irqflags; + + spin_lock_irqsave(&aq->lock, irqflags); + + if (!list_empty(&wr->aq_ent)) { + list_del(&wr->aq_ent); + if (wr->status != IONIC_ADMIN_POSTED) + aq->q_wr[wr->status].wr = NULL; + } + + spin_unlock_irqrestore(&aq->lock, irqflags); +} + +static int ionic_admin_busy_wait(struct ionic_admin_wr *wr) +{ + struct ionic_aq *aq = wr->aq; + unsigned long irqflags; + int try_i; + + for (try_i = 0; try_i < IONIC_ADMIN_BUSY_RETRY_COUNT; ++try_i) { + if (completion_done(&wr->work)) + return 0; + + mdelay(IONIC_ADMIN_BUSY_RETRY_MS); + + spin_lock_irqsave(&aq->lock, irqflags); + ionic_admin_poll_locked(aq); + spin_unlock_irqrestore(&aq->lock, irqflags); + } + + /* + * we timed out. Initiate RDMA LIF reset and indicate + * error to caller. + */ + ionic_admin_timedout(aq); + return -ETIMEDOUT; +} + +int ionic_admin_wait(struct ionic_ibdev *dev, struct ionic_admin_wr *wr, + enum ionic_admin_flags flags) +{ + int rc, timo; + + if (flags & IONIC_ADMIN_F_BUSYWAIT) { + /* Spin */ + rc = ionic_admin_busy_wait(wr); + } else if (flags & IONIC_ADMIN_F_INTERRUPT) { + /* + * Interruptible sleep, 1s timeout + * This is used for commands which are safe for the caller + * to clean up without killing and resetting the adminq. 
+ */ + timo = wait_for_completion_interruptible_timeout(&wr->work, + HZ); + if (timo > 0) + rc = 0; + else if (timo == 0) + rc = -ETIMEDOUT; + else + rc = timo; + } else { + /* + * Uninterruptible sleep + * This is used for commands which are NOT safe for the + * caller to clean up. Cleanup must be handled by the + * adminq kill and reset process so that host memory is + * not corrupted by the device. + */ + wait_for_completion(&wr->work); + rc = 0; + } + + if (rc) { + ibdev_warn(&dev->ibdev, "wait status %d\n", rc); + ionic_admin_cancel(wr); + } else if (wr->status == IONIC_ADMIN_KILLED) { + ibdev_dbg(&dev->ibdev, "admin killed\n"); + + /* No error if admin already killed during teardown */ + rc = (flags & IONIC_ADMIN_F_TEARDOWN) ? 0 : -ENODEV; + } else if (ionic_v1_cqe_error(&wr->cqe)) { + ibdev_warn(&dev->ibdev, "opcode %u error %u\n", + wr->wqe.op, + be32_to_cpu(wr->cqe.status_length)); + rc = -EINVAL; + } + return rc; +} + +static int ionic_rdma_devcmd(struct ionic_ibdev *dev, + struct ionic_admin_ctx *admin) +{ + int rc; + + rc = ionic_adminq_post_wait(dev->lif_cfg.lif, admin); + if (rc) + return rc; + + return ionic_error_to_errno(admin->comp.comp.status); +} + +int ionic_rdma_reset_devcmd(struct ionic_ibdev *dev) +{ + struct ionic_admin_ctx admin = { + .work = COMPLETION_INITIALIZER_ONSTACK(admin.work), + .cmd.rdma_reset = { + .opcode = IONIC_CMD_RDMA_RESET_LIF, + .lif_index = cpu_to_le16(dev->lif_cfg.lif_index), + }, + }; + + return ionic_rdma_devcmd(dev, &admin); +} + +static int ionic_rdma_queue_devcmd(struct ionic_ibdev *dev, + struct ionic_queue *q, + u32 qid, u32 cid, u16 opcode) +{ + struct ionic_admin_ctx admin = { + .work = COMPLETION_INITIALIZER_ONSTACK(admin.work), + .cmd.rdma_queue = { + .opcode = opcode, + .lif_index = cpu_to_le16(dev->lif_cfg.lif_index), + .qid_ver = cpu_to_le32(qid), + .cid = cpu_to_le32(cid), + .dbid = cpu_to_le16(dev->lif_cfg.dbid), + .depth_log2 = q->depth_log2, + .stride_log2 = q->stride_log2, + .dma_addr = cpu_to_le64(q->dma), + }, + }; + + return ionic_rdma_devcmd(dev, &admin); +} + +static void ionic_rdma_admincq_comp(struct ib_cq *ibcq, void *cq_context) +{ + struct ionic_aq *aq = cq_context; + unsigned long irqflags; + + spin_lock_irqsave(&aq->lock, irqflags); + aq->armed = false; + if (atomic_read(&aq->admin_state) < IONIC_ADMIN_KILLED) + queue_work(ionic_evt_workq, &aq->work); + spin_unlock_irqrestore(&aq->lock, irqflags); +} + +static void ionic_rdma_admincq_event(struct ib_event *event, void *cq_context) +{ + struct ionic_aq *aq = cq_context; + + ibdev_err(&aq->dev->ibdev, "admincq event %d\n", event->event); +} + +static struct ionic_vcq *ionic_create_rdma_admincq(struct ionic_ibdev *dev, + int comp_vector) +{ + struct ib_cq_init_attr attr = { + .cqe = IONIC_AQ_DEPTH, + .comp_vector = comp_vector, + }; + struct ionic_tbl_buf buf = {}; + struct ionic_vcq *vcq; + struct ionic_cq *cq; + int rc; + + vcq = kzalloc(sizeof(*vcq), GFP_KERNEL); + if (!vcq) + return ERR_PTR(-ENOMEM); + + vcq->ibcq.device = &dev->ibdev; + vcq->ibcq.comp_handler = ionic_rdma_admincq_comp; + vcq->ibcq.event_handler = ionic_rdma_admincq_event; + atomic_set(&vcq->ibcq.usecnt, 0); + + vcq->udma_mask = 1; + cq = &vcq->cq[0]; + + rc = ionic_create_cq_common(vcq, &buf, &attr, NULL, NULL, + NULL, NULL, 0); + if (rc) + goto err_init; + + rc = ionic_rdma_queue_devcmd(dev, &cq->q, cq->cqid, cq->eqid, + IONIC_CMD_RDMA_CREATE_CQ); + if (rc) + goto err_cmd; + + return vcq; + +err_cmd: + ionic_destroy_cq_common(dev, cq); +err_init: + kfree(vcq); + + return ERR_PTR(rc); +} + 
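For orientation, the admin path defined above follows one submit/complete pattern throughout: a caller stack-allocates an ionic_admin_wr with an on-stack completion, posts it with ionic_admin_post(), and waits with ionic_admin_wait() in one of the three modes described in the comments (busy-wait, interruptible with a 1s timeout, or uninterruptible). A minimal sketch of such a caller, with the command payload left as a placeholder rather than a real opcode from ionic_fw.h:

/* illustrative only, not part of the patch */
static int ionic_example_admin_cmd(struct ionic_ibdev *dev)
{
	struct ionic_admin_wr wr = {
		.work = COMPLETION_INITIALIZER_ONSTACK(wr.work),
		/* a real caller fills wr.wqe.op, wr.wqe.len and the
		 * command body from the ionic_fw.h definitions
		 */
	};

	/* queues the wqe on an adminq selected by the current cpu */
	ionic_admin_post(dev, &wr);

	/* flags == 0 sleeps uninterruptibly; cleanup after a kill is
	 * handled by the adminq kill/reset path, not the caller.
	 * IONIC_ADMIN_F_BUSYWAIT / IONIC_ADMIN_F_INTERRUPT select the
	 * other two wait modes; on success, parse wr.cqe.
	 */
	return ionic_admin_wait(dev, &wr, 0);
}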
+static struct ionic_aq *__ionic_create_rdma_adminq(struct ionic_ibdev *dev, + u32 aqid, u32 cqid) +{ + struct ionic_aq *aq; + int rc; + + aq = kzalloc(sizeof(*aq), GFP_KERNEL); + if (!aq) + return ERR_PTR(-ENOMEM); + + atomic_set(&aq->admin_state, IONIC_ADMIN_KILLED); + aq->dev = dev; + aq->aqid = aqid; + aq->cqid = cqid; + spin_lock_init(&aq->lock); + + rc = ionic_queue_init(&aq->q, dev->lif_cfg.hwdev, IONIC_AQ_DEPTH, + ADMIN_WQE_STRIDE); + if (rc) + goto err_q; + + ionic_queue_dbell_init(&aq->q, aq->aqid); + + aq->q_wr = kcalloc((u32)aq->q.mask + 1, sizeof(*aq->q_wr), GFP_KERNEL); + if (!aq->q_wr) { + rc = -ENOMEM; + goto err_wr; + } + + INIT_LIST_HEAD(&aq->wr_prod); + INIT_LIST_HEAD(&aq->wr_post); + + INIT_WORK(&aq->work, ionic_admin_work); + aq->armed = false; + + return aq; + +err_wr: + ionic_queue_destroy(&aq->q, dev->lif_cfg.hwdev); +err_q: + kfree(aq); + + return ERR_PTR(rc); +} + +static void __ionic_destroy_rdma_adminq(struct ionic_ibdev *dev, + struct ionic_aq *aq) +{ + kfree(aq->q_wr); + ionic_queue_destroy(&aq->q, dev->lif_cfg.hwdev); + kfree(aq); +} + +static struct ionic_aq *ionic_create_rdma_adminq(struct ionic_ibdev *dev, + u32 aqid, u32 cqid) +{ + struct ionic_aq *aq; + int rc; + + aq = __ionic_create_rdma_adminq(dev, aqid, cqid); + if (IS_ERR(aq)) + return aq; + + rc = ionic_rdma_queue_devcmd(dev, &aq->q, aq->aqid, aq->cqid, + IONIC_CMD_RDMA_CREATE_ADMINQ); + if (rc) + goto err_cmd; + + return aq; + +err_cmd: + __ionic_destroy_rdma_adminq(dev, aq); + + return ERR_PTR(rc); +} + +static void ionic_flush_qs(struct ionic_ibdev *dev) +{ + struct ionic_qp *qp, *qp_tmp; + struct ionic_cq *cq, *cq_tmp; + LIST_HEAD(flush_list); + unsigned long index; + + WARN_ON(!irqs_disabled()); + + /* Flush qp send and recv */ + xa_lock(&dev->qp_tbl); + xa_for_each(&dev->qp_tbl, index, qp) { + kref_get(&qp->qp_kref); + list_add_tail(&qp->ibkill_flush_ent, &flush_list); + } + xa_unlock(&dev->qp_tbl); + + list_for_each_entry_safe(qp, qp_tmp, &flush_list, ibkill_flush_ent) { + ionic_flush_qp(dev, qp); + kref_put(&qp->qp_kref, ionic_qp_complete); + list_del(&qp->ibkill_flush_ent); + } + + /* Notify completions */ + xa_lock(&dev->cq_tbl); + xa_for_each(&dev->cq_tbl, index, cq) { + kref_get(&cq->cq_kref); + list_add_tail(&cq->ibkill_flush_ent, &flush_list); + } + xa_unlock(&dev->cq_tbl); + + list_for_each_entry_safe(cq, cq_tmp, &flush_list, ibkill_flush_ent) { + ionic_notify_flush_cq(cq); + kref_put(&cq->cq_kref, ionic_cq_complete); + list_del(&cq->ibkill_flush_ent); + } +} + +static void ionic_kill_ibdev(struct ionic_ibdev *dev, bool fatal_path) +{ + unsigned long irqflags; + bool do_flush = false; + int i; + + /* Mark AQs for drain and flush the QPs while irq is disabled */ + local_irq_save(irqflags); + + /* Mark the admin queue, flushing at most once */ + for (i = 0; i < dev->lif_cfg.aq_count; i++) { + struct ionic_aq *aq = dev->aq_vec[i]; + + spin_lock(&aq->lock); + if (atomic_read(&aq->admin_state) != IONIC_ADMIN_KILLED) { + atomic_set(&aq->admin_state, IONIC_ADMIN_KILLED); + /* Flush incomplete admin commands */ + ionic_admin_poll_locked(aq); + do_flush = true; + } + spin_unlock(&aq->lock); + } + + if (do_flush) + ionic_flush_qs(dev); + + local_irq_restore(irqflags); + + /* Post a fatal event if requested */ + if (fatal_path) { + struct ib_event ev; + + ev.device = &dev->ibdev; + ev.element.port_num = 1; + ev.event = IB_EVENT_DEVICE_FATAL; + + ib_dispatch_event(&ev); + } + + atomic_set(&dev->admin_state, IONIC_ADMIN_KILLED); +} + +void ionic_kill_rdma_admin(struct ionic_ibdev *dev, bool
fatal_path) +{ + enum ionic_admin_state old_state; + unsigned long irqflags = 0; + int i, rc; + + if (!dev->aq_vec) + return; + + /* + * Admin queues are transitioned from active to paused to killed state. + * When in paused state, no new commands are issued to the device, + * nor are any completed locally. After resetting the lif, it will be + * safe to resume the rdma admin queues in the killed state. Commands + * will not be issued to the device, but will complete locally with status + * IONIC_ADMIN_KILLED. Handling completion will ensure that creating or + * modifying resources fails, but destroying resources succeeds. + * If there was a failure resetting the lif using this strategy, + * then the state of the device is unknown. + */ + old_state = atomic_cmpxchg(&dev->admin_state, IONIC_ADMIN_ACTIVE, + IONIC_ADMIN_PAUSED); + if (old_state != IONIC_ADMIN_ACTIVE) + return; + + /* Pause all the AQs */ + local_irq_save(irqflags); + for (i = 0; i < dev->lif_cfg.aq_count; i++) { + struct ionic_aq *aq = dev->aq_vec[i]; + + spin_lock(&aq->lock); + /* pause rdma admin queues to reset lif */ + if (atomic_read(&aq->admin_state) == IONIC_ADMIN_ACTIVE) + atomic_set(&aq->admin_state, IONIC_ADMIN_PAUSED); + spin_unlock(&aq->lock); + } + local_irq_restore(irqflags); + + rc = ionic_rdma_reset_devcmd(dev); + if (unlikely(rc)) { + ibdev_err(&dev->ibdev, "failed to reset rdma %d\n", rc); + ionic_request_rdma_reset(dev->lif_cfg.lif); + } + + ionic_kill_ibdev(dev, fatal_path); +} + +static void ionic_reset_work(struct work_struct *ws) +{ + struct ionic_ibdev *dev = + container_of(ws, struct ionic_ibdev, reset_work); + + ionic_kill_rdma_admin(dev, true); +} + +static bool ionic_next_eqe(struct ionic_eq *eq, struct ionic_v1_eqe *eqe) +{ + struct ionic_v1_eqe *qeqe; + bool color; + + qeqe = ionic_queue_at_prod(&eq->q); + color = ionic_v1_eqe_color(qeqe); + + /* cons is color for eq */ + if (eq->q.cons != color) + return false; + + /* Prevent out-of-order reads of the EQE */ + dma_rmb(); + + ibdev_dbg(&eq->dev->ibdev, "poll eq prod %u\n", eq->q.prod); + print_hex_dump_debug("eqe ", DUMP_PREFIX_OFFSET, 16, 1, + qeqe, BIT(eq->q.stride_log2), true); + *eqe = *qeqe; + + return true; +} + +static void ionic_cq_event(struct ionic_ibdev *dev, u32 cqid, u8 code) +{ + unsigned long irqflags; + struct ib_event ibev; + struct ionic_cq *cq; + + xa_lock_irqsave(&dev->cq_tbl, irqflags); + cq = xa_load(&dev->cq_tbl, cqid); + if (cq) + kref_get(&cq->cq_kref); + xa_unlock_irqrestore(&dev->cq_tbl, irqflags); + + if (!cq) { + ibdev_dbg(&dev->ibdev, + "missing cqid %#x code %u\n", cqid, code); + return; + } + + switch (code) { + case IONIC_V1_EQE_CQ_NOTIFY: + if (cq->vcq->ibcq.comp_handler) + cq->vcq->ibcq.comp_handler(&cq->vcq->ibcq, + cq->vcq->ibcq.cq_context); + break; + + case IONIC_V1_EQE_CQ_ERR: + if (cq->vcq->ibcq.event_handler) { + ibev.event = IB_EVENT_CQ_ERR; + ibev.device = &dev->ibdev; + ibev.element.cq = &cq->vcq->ibcq; + + cq->vcq->ibcq.event_handler(&ibev, + cq->vcq->ibcq.cq_context); + } + break; + + default: + ibdev_dbg(&dev->ibdev, + "unrecognized cqid %#x code %u\n", cqid, code); + break; + } + + kref_put(&cq->cq_kref, ionic_cq_complete); +} + +static void ionic_qp_event(struct ionic_ibdev *dev, u32 qpid, u8 code) +{ + unsigned long irqflags; + struct ib_event ibev; + struct ionic_qp *qp; + + xa_lock_irqsave(&dev->qp_tbl, irqflags); + qp = xa_load(&dev->qp_tbl, qpid); + if (qp) + kref_get(&qp->qp_kref); + xa_unlock_irqrestore(&dev->qp_tbl, irqflags); + + if (!qp) { + ibdev_dbg(&dev->ibdev, + "missing qpid %#x 
code %u\n", qpid, code); + return; + } + + ibev.device = &dev->ibdev; + ibev.element.qp = &qp->ibqp; + + switch (code) { + case IONIC_V1_EQE_SQ_DRAIN: + ibev.event = IB_EVENT_SQ_DRAINED; + break; + + case IONIC_V1_EQE_QP_COMM_EST: + ibev.event = IB_EVENT_COMM_EST; + break; + + case IONIC_V1_EQE_QP_LAST_WQE: + ibev.event = IB_EVENT_QP_LAST_WQE_REACHED; + break; + + case IONIC_V1_EQE_QP_ERR: + ibev.event = IB_EVENT_QP_FATAL; + break; + + case IONIC_V1_EQE_QP_ERR_REQUEST: + ibev.event = IB_EVENT_QP_REQ_ERR; + break; + + case IONIC_V1_EQE_QP_ERR_ACCESS: + ibev.event = IB_EVENT_QP_ACCESS_ERR; + break; + + default: + ibdev_dbg(&dev->ibdev, + "unrecognized qpid %#x code %u\n", qpid, code); + goto out; + } + + if (qp->ibqp.event_handler) + qp->ibqp.event_handler(&ibev, qp->ibqp.qp_context); + +out: + kref_put(&qp->qp_kref, ionic_qp_complete); +} + +static u16 ionic_poll_eq(struct ionic_eq *eq, u16 budget) +{ + struct ionic_ibdev *dev = eq->dev; + struct ionic_v1_eqe eqe; + u16 npolled = 0; + u8 type, code; + u32 evt, qid; + + while (npolled < budget) { + if (!ionic_next_eqe(eq, &eqe)) + break; + + ionic_queue_produce(&eq->q); + + /* cons is color for eq */ + eq->q.cons = ionic_color_wrap(eq->q.prod, eq->q.cons); + + ++npolled; + + evt = ionic_v1_eqe_evt(&eqe); + type = ionic_v1_eqe_evt_type(evt); + code = ionic_v1_eqe_evt_code(evt); + qid = ionic_v1_eqe_evt_qid(evt); + + switch (type) { + case IONIC_V1_EQE_TYPE_CQ: + ionic_cq_event(dev, qid, code); + break; + + case IONIC_V1_EQE_TYPE_QP: + ionic_qp_event(dev, qid, code); + break; + + default: + ibdev_dbg(&dev->ibdev, + "unknown event %#x type %u\n", evt, type); + } + } + + return npolled; +} + +static void ionic_poll_eq_work(struct work_struct *work) +{ + struct ionic_eq *eq = container_of(work, struct ionic_eq, work); + u32 npolled; + + if (unlikely(!eq->enable) || WARN_ON(eq->armed)) + return; + + npolled = ionic_poll_eq(eq, IONIC_EQ_WORK_BUDGET); + if (npolled == IONIC_EQ_WORK_BUDGET) { + ionic_intr_credits(eq->dev->lif_cfg.intr_ctrl, eq->intr, + npolled, 0); + queue_work(ionic_evt_workq, &eq->work); + } else { + xchg(&eq->armed, 1); + ionic_intr_credits(eq->dev->lif_cfg.intr_ctrl, eq->intr, + 0, IONIC_INTR_CRED_UNMASK); + } +} + +static irqreturn_t ionic_poll_eq_isr(int irq, void *eqptr) +{ + struct ionic_eq *eq = eqptr; + int was_armed; + u32 npolled; + + was_armed = xchg(&eq->armed, 0); + + if (unlikely(!eq->enable) || !was_armed) + return IRQ_HANDLED; + + npolled = ionic_poll_eq(eq, IONIC_EQ_ISR_BUDGET); + if (npolled == IONIC_EQ_ISR_BUDGET) { + ionic_intr_credits(eq->dev->lif_cfg.intr_ctrl, eq->intr, + npolled, 0); + queue_work(ionic_evt_workq, &eq->work); + } else { + xchg(&eq->armed, 1); + ionic_intr_credits(eq->dev->lif_cfg.intr_ctrl, eq->intr, + 0, IONIC_INTR_CRED_UNMASK); + } + + return IRQ_HANDLED; +} + +static struct ionic_eq *ionic_create_eq(struct ionic_ibdev *dev, int eqid) +{ + struct ionic_intr_info intr_obj = { }; + struct ionic_eq *eq; + int rc; + + eq = kzalloc(sizeof(*eq), GFP_KERNEL); + if (!eq) + return ERR_PTR(-ENOMEM); + + eq->dev = dev; + + rc = ionic_queue_init(&eq->q, dev->lif_cfg.hwdev, IONIC_EQ_DEPTH, + sizeof(struct ionic_v1_eqe)); + if (rc) + goto err_q; + + eq->eqid = eqid; + + eq->armed = true; + eq->enable = false; + INIT_WORK(&eq->work, ionic_poll_eq_work); + + rc = ionic_intr_alloc(dev->lif_cfg.lif, &intr_obj); + if (rc < 0) + goto err_intr; + + eq->irq = intr_obj.vector; + eq->intr = intr_obj.index; + + ionic_queue_dbell_init(&eq->q, eq->eqid); + + /* cons is color for eq */ + eq->q.cons = true; + + 
snprintf(eq->name, sizeof(eq->name), "%s-%d-%d-eq", + "ionr", dev->lif_cfg.lif_index, eq->eqid); + + ionic_intr_mask(dev->lif_cfg.intr_ctrl, eq->intr, IONIC_INTR_MASK_SET); + ionic_intr_mask_assert(dev->lif_cfg.intr_ctrl, eq->intr, IONIC_INTR_MASK_SET); + ionic_intr_coal_init(dev->lif_cfg.intr_ctrl, eq->intr, 0); + ionic_intr_clean(dev->lif_cfg.intr_ctrl, eq->intr); + + eq->enable = true; + + rc = request_irq(eq->irq, ionic_poll_eq_isr, 0, eq->name, eq); + if (rc) + goto err_irq; + + rc = ionic_rdma_queue_devcmd(dev, &eq->q, eq->eqid, eq->intr, + IONIC_CMD_RDMA_CREATE_EQ); + if (rc) + goto err_cmd; + + ionic_intr_mask(dev->lif_cfg.intr_ctrl, eq->intr, IONIC_INTR_MASK_CLEAR); + + return eq; + +err_cmd: + eq->enable = false; + free_irq(eq->irq, eq); + flush_work(&eq->work); +err_irq: + ionic_intr_free(dev->lif_cfg.lif, eq->intr); +err_intr: + ionic_queue_destroy(&eq->q, dev->lif_cfg.hwdev); +err_q: + kfree(eq); + + return ERR_PTR(rc); +} + +static void ionic_destroy_eq(struct ionic_eq *eq) +{ + struct ionic_ibdev *dev = eq->dev; + + eq->enable = false; + free_irq(eq->irq, eq); + flush_work(&eq->work); + + ionic_intr_free(dev->lif_cfg.lif, eq->intr); + ionic_queue_destroy(&eq->q, dev->lif_cfg.hwdev); + kfree(eq); +} + +int ionic_create_rdma_admin(struct ionic_ibdev *dev) +{ + int eq_i = 0, aq_i = 0, rc = 0; + struct ionic_vcq *vcq; + struct ionic_aq *aq; + struct ionic_eq *eq; + + dev->eq_vec = NULL; + dev->aq_vec = NULL; + + INIT_WORK(&dev->reset_work, ionic_reset_work); + INIT_DELAYED_WORK(&dev->admin_dwork, ionic_admin_dwork); + atomic_set(&dev->admin_state, IONIC_ADMIN_KILLED); + + if (dev->lif_cfg.aq_count > IONIC_AQ_COUNT) { + ibdev_dbg(&dev->ibdev, "limiting adminq count to %d\n", + IONIC_AQ_COUNT); + dev->lif_cfg.aq_count = IONIC_AQ_COUNT; + } + + if (dev->lif_cfg.eq_count > IONIC_EQ_COUNT) { + dev_dbg(&dev->ibdev.dev, "limiting eventq count to %d\n", + IONIC_EQ_COUNT); + dev->lif_cfg.eq_count = IONIC_EQ_COUNT; + } + + /* need at least two eq and one aq */ + if (dev->lif_cfg.eq_count < IONIC_EQ_COUNT_MIN || + dev->lif_cfg.aq_count < IONIC_AQ_COUNT_MIN) { + rc = -EINVAL; + goto out; + } + + dev->eq_vec = kmalloc_array(dev->lif_cfg.eq_count, sizeof(*dev->eq_vec), + GFP_KERNEL); + if (!dev->eq_vec) { + rc = -ENOMEM; + goto out; + } + + for (eq_i = 0; eq_i < dev->lif_cfg.eq_count; ++eq_i) { + eq = ionic_create_eq(dev, eq_i + dev->lif_cfg.eq_base); + if (IS_ERR(eq)) { + rc = PTR_ERR(eq); + + if (eq_i < IONIC_EQ_COUNT_MIN) { + ibdev_err(&dev->ibdev, + "fail create eq %pe\n", eq); + goto out; + } + + /* ok, just fewer eq than device supports */ + ibdev_dbg(&dev->ibdev, "eq count %d want %d rc %pe\n", + eq_i, dev->lif_cfg.eq_count, eq); + + rc = 0; + break; + } + + dev->eq_vec[eq_i] = eq; + } + + dev->lif_cfg.eq_count = eq_i; + + dev->aq_vec = kmalloc_array(dev->lif_cfg.aq_count, sizeof(*dev->aq_vec), + GFP_KERNEL); + if (!dev->aq_vec) { + rc = -ENOMEM; + goto out; + } + + /* Create one CQ per AQ */ + for (aq_i = 0; aq_i < dev->lif_cfg.aq_count; ++aq_i) { + vcq = ionic_create_rdma_admincq(dev, aq_i % eq_i); + if (IS_ERR(vcq)) { + rc = PTR_ERR(vcq); + + if (!aq_i) { + ibdev_err(&dev->ibdev, + "failed to create acq %pe\n", vcq); + goto out; + } + + /* ok, just fewer adminq than device supports */ + ibdev_dbg(&dev->ibdev, "acq count %d want %d rc %pe\n", + aq_i, dev->lif_cfg.aq_count, vcq); + break; + } + + aq = ionic_create_rdma_adminq(dev, aq_i + dev->lif_cfg.aq_base, + vcq->cq[0].cqid); + if (IS_ERR(aq)) { + /* Clean up the dangling CQ */ + ionic_destroy_cq_common(dev, &vcq->cq[0]); + 
kfree(vcq); + + rc = PTR_ERR(aq); + + if (!aq_i) { + ibdev_err(&dev->ibdev, + "failed to create aq %pe\n", aq); + goto out; + } + + /* ok, just fewer adminq than device supports */ + ibdev_dbg(&dev->ibdev, "aq count %d want %d rc %pe\n", + aq_i, dev->lif_cfg.aq_count, aq); + break; + } + + vcq->ibcq.cq_context = aq; + aq->vcq = vcq; + + atomic_set(&aq->admin_state, IONIC_ADMIN_ACTIVE); + dev->aq_vec[aq_i] = aq; + } + + atomic_set(&dev->admin_state, IONIC_ADMIN_ACTIVE); +out: + dev->lif_cfg.eq_count = eq_i; + dev->lif_cfg.aq_count = aq_i; + + return rc; +} + +void ionic_destroy_rdma_admin(struct ionic_ibdev *dev) +{ + struct ionic_vcq *vcq; + struct ionic_aq *aq; + struct ionic_eq *eq; + + /* + * Killing the admin before destroy makes sure all admin and + * completions are flushed. admin_state = IONIC_ADMIN_KILLED + * stops queueing up further works. + */ + cancel_delayed_work_sync(&dev->admin_dwork); + cancel_work_sync(&dev->reset_work); + + if (dev->aq_vec) { + while (dev->lif_cfg.aq_count > 0) { + aq = dev->aq_vec[--dev->lif_cfg.aq_count]; + vcq = aq->vcq; + + cancel_work_sync(&aq->work); + + __ionic_destroy_rdma_adminq(dev, aq); + if (vcq) { + ionic_destroy_cq_common(dev, &vcq->cq[0]); + kfree(vcq); + } + } + + kfree(dev->aq_vec); + } + + if (dev->eq_vec) { + while (dev->lif_cfg.eq_count > 0) { + eq = dev->eq_vec[--dev->lif_cfg.eq_count]; + ionic_destroy_eq(eq); + } + + kfree(dev->eq_vec); + } +} diff --git a/drivers/infiniband/hw/ionic/ionic_controlpath.c b/drivers/infiniband/hw/ionic/ionic_controlpath.c new file mode 100644 index 000000000000..ea12d9b8e125 --- /dev/null +++ b/drivers/infiniband/hw/ionic/ionic_controlpath.c @@ -0,0 +1,2679 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2025, Advanced Micro Devices, Inc. */ + +#include <linux/module.h> +#include <linux/printk.h> +#include <rdma/ib_addr.h> +#include <rdma/ib_cache.h> +#include <rdma/ib_user_verbs.h> +#include <ionic_api.h> + +#include "ionic_fw.h" +#include "ionic_ibdev.h" + +#define ionic_set_ecn(tos) (((tos) | 2u) & ~1u) +#define ionic_clear_ecn(tos) ((tos) & ~3u) + +static int ionic_validate_qdesc(struct ionic_qdesc *q) +{ + if (!q->addr || !q->size || !q->mask || + !q->depth_log2 || !q->stride_log2) + return -EINVAL; + + if (q->addr & (PAGE_SIZE - 1)) + return -EINVAL; + + if (q->mask != BIT(q->depth_log2) - 1) + return -EINVAL; + + if (q->size < BIT_ULL(q->depth_log2 + q->stride_log2)) + return -EINVAL; + + return 0; +} + +static u32 ionic_get_eqid(struct ionic_ibdev *dev, u32 comp_vector, u8 udma_idx) +{ + /* EQ per vector per udma, and the first eqs reserved for async events. + * The rest of the vectors can be requested for completions. 
+ */ + u32 comp_vec_count = dev->lif_cfg.eq_count / dev->lif_cfg.udma_count - 1; + + return (comp_vector % comp_vec_count + 1) * dev->lif_cfg.udma_count + udma_idx; +} + +static int ionic_get_cqid(struct ionic_ibdev *dev, u32 *cqid, u8 udma_idx) +{ + unsigned int size, base, bound; + int rc; + + size = dev->lif_cfg.cq_count / dev->lif_cfg.udma_count; + base = size * udma_idx; + bound = base + size; + + rc = ionic_resid_get_shared(&dev->inuse_cqid, base, bound); + if (rc >= 0) { + /* cq_base is zero or a multiple of two queue groups */ + *cqid = dev->lif_cfg.cq_base + + ionic_bitid_to_qid(rc, dev->lif_cfg.udma_qgrp_shift, + dev->half_cqid_udma_shift); + + rc = 0; + } + + return rc; +} + +static void ionic_put_cqid(struct ionic_ibdev *dev, u32 cqid) +{ + u32 bitid = ionic_qid_to_bitid(cqid - dev->lif_cfg.cq_base, + dev->lif_cfg.udma_qgrp_shift, + dev->half_cqid_udma_shift); + + ionic_resid_put(&dev->inuse_cqid, bitid); +} + +int ionic_create_cq_common(struct ionic_vcq *vcq, + struct ionic_tbl_buf *buf, + const struct ib_cq_init_attr *attr, + struct ionic_ctx *ctx, + struct ib_udata *udata, + struct ionic_qdesc *req_cq, + __u32 *resp_cqid, + int udma_idx) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(vcq->ibcq.device); + struct ionic_cq *cq = &vcq->cq[udma_idx]; + void *entry; + int rc; + + cq->vcq = vcq; + + if (attr->cqe < 1 || attr->cqe + IONIC_CQ_GRACE > 0xffff) { + rc = -EINVAL; + goto err_args; + } + + rc = ionic_get_cqid(dev, &cq->cqid, udma_idx); + if (rc) + goto err_args; + + cq->eqid = ionic_get_eqid(dev, attr->comp_vector, udma_idx); + + spin_lock_init(&cq->lock); + INIT_LIST_HEAD(&cq->poll_sq); + INIT_LIST_HEAD(&cq->flush_sq); + INIT_LIST_HEAD(&cq->flush_rq); + + if (udata) { + rc = ionic_validate_qdesc(req_cq); + if (rc) + goto err_qdesc; + + cq->umem = ib_umem_get(&dev->ibdev, req_cq->addr, req_cq->size, + IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(cq->umem)) { + rc = PTR_ERR(cq->umem); + goto err_qdesc; + } + + cq->q.ptr = NULL; + cq->q.size = req_cq->size; + cq->q.mask = req_cq->mask; + cq->q.depth_log2 = req_cq->depth_log2; + cq->q.stride_log2 = req_cq->stride_log2; + + *resp_cqid = cq->cqid; + } else { + rc = ionic_queue_init(&cq->q, dev->lif_cfg.hwdev, + attr->cqe + IONIC_CQ_GRACE, + sizeof(struct ionic_v1_cqe)); + if (rc) + goto err_q_init; + + ionic_queue_dbell_init(&cq->q, cq->cqid); + cq->color = true; + cq->credit = cq->q.mask; + } + + rc = ionic_pgtbl_init(dev, buf, cq->umem, cq->q.dma, 1, PAGE_SIZE); + if (rc) + goto err_pgtbl_init; + + init_completion(&cq->cq_rel_comp); + kref_init(&cq->cq_kref); + + entry = xa_store_irq(&dev->cq_tbl, cq->cqid, cq, GFP_KERNEL); + if (entry) { + if (!xa_is_err(entry)) + rc = -EINVAL; + else + rc = xa_err(entry); + + goto err_xa; + } + + return 0; + +err_xa: + ionic_pgtbl_unbuf(dev, buf); +err_pgtbl_init: + if (!udata) + ionic_queue_destroy(&cq->q, dev->lif_cfg.hwdev); +err_q_init: + if (cq->umem) + ib_umem_release(cq->umem); +err_qdesc: + ionic_put_cqid(dev, cq->cqid); +err_args: + cq->vcq = NULL; + + return rc; +} + +void ionic_destroy_cq_common(struct ionic_ibdev *dev, struct ionic_cq *cq) +{ + if (!cq->vcq) + return; + + xa_erase_irq(&dev->cq_tbl, cq->cqid); + + kref_put(&cq->cq_kref, ionic_cq_complete); + wait_for_completion(&cq->cq_rel_comp); + + if (cq->umem) + ib_umem_release(cq->umem); + else + ionic_queue_destroy(&cq->q, dev->lif_cfg.hwdev); + + ionic_put_cqid(dev, cq->cqid); + + cq->vcq = NULL; +} + +static int ionic_validate_qdesc_zero(struct ionic_qdesc *q) +{ + if (q->addr || q->size || q->mask || q->depth_log2 || 
q->stride_log2) + return -EINVAL; + + return 0; +} + +static int ionic_get_pdid(struct ionic_ibdev *dev, u32 *pdid) +{ + int rc; + + rc = ionic_resid_get(&dev->inuse_pdid); + if (rc < 0) + return rc; + + *pdid = rc; + return 0; +} + +static int ionic_get_ahid(struct ionic_ibdev *dev, u32 *ahid) +{ + int rc; + + rc = ionic_resid_get(&dev->inuse_ahid); + if (rc < 0) + return rc; + + *ahid = rc; + return 0; +} + +static int ionic_get_mrid(struct ionic_ibdev *dev, u32 *mrid) +{ + int rc; + + /* wrap to 1, skip reserved lkey */ + rc = ionic_resid_get_shared(&dev->inuse_mrid, 1, + dev->inuse_mrid.inuse_size); + if (rc < 0) + return rc; + + *mrid = ionic_mrid(rc, dev->next_mrkey++); + return 0; +} + +static int ionic_get_gsi_qpid(struct ionic_ibdev *dev, u32 *qpid) +{ + int rc = 0; + + rc = ionic_resid_get_shared(&dev->inuse_qpid, IB_QPT_GSI, IB_QPT_GSI + 1); + if (rc < 0) + return rc; + + *qpid = IB_QPT_GSI; + return 0; +} + +static int ionic_get_qpid(struct ionic_ibdev *dev, u32 *qpid, + u8 *udma_idx, u8 udma_mask) +{ + unsigned int size, base, bound; + int udma_i, udma_x, udma_ix; + int rc = -EINVAL; + + udma_x = dev->next_qpid_udma_idx; + + dev->next_qpid_udma_idx ^= dev->lif_cfg.udma_count - 1; + + for (udma_i = 0; udma_i < dev->lif_cfg.udma_count; ++udma_i) { + udma_ix = udma_i ^ udma_x; + + if (!(udma_mask & BIT(udma_ix))) + continue; + + size = dev->lif_cfg.qp_count / dev->lif_cfg.udma_count; + base = size * udma_ix; + bound = base + size; + + /* skip reserved SMI and GSI qpids in group zero */ + if (!base) + base = 2; + + rc = ionic_resid_get_shared(&dev->inuse_qpid, base, bound); + if (rc >= 0) { + *qpid = ionic_bitid_to_qid(rc, + dev->lif_cfg.udma_qgrp_shift, + dev->half_qpid_udma_shift); + *udma_idx = udma_ix; + + rc = 0; + break; + } + } + + return rc; +} + +static int ionic_get_dbid(struct ionic_ibdev *dev, u32 *dbid, phys_addr_t *addr) +{ + int rc, dbpage_num; + + /* wrap to 1, skip kernel reserved */ + rc = ionic_resid_get_shared(&dev->inuse_dbid, 1, + dev->inuse_dbid.inuse_size); + if (rc < 0) + return rc; + + dbpage_num = (dev->lif_cfg.lif_hw_index * dev->lif_cfg.dbid_count) + rc; + *addr = dev->lif_cfg.db_phys + ((phys_addr_t)dbpage_num << PAGE_SHIFT); + + *dbid = rc; + + return 0; +} + +static void ionic_put_pdid(struct ionic_ibdev *dev, u32 pdid) +{ + ionic_resid_put(&dev->inuse_pdid, pdid); +} + +static void ionic_put_ahid(struct ionic_ibdev *dev, u32 ahid) +{ + ionic_resid_put(&dev->inuse_ahid, ahid); +} + +static void ionic_put_mrid(struct ionic_ibdev *dev, u32 mrid) +{ + ionic_resid_put(&dev->inuse_mrid, ionic_mrid_index(mrid)); +} + +static void ionic_put_qpid(struct ionic_ibdev *dev, u32 qpid) +{ + u32 bitid = ionic_qid_to_bitid(qpid, + dev->lif_cfg.udma_qgrp_shift, + dev->half_qpid_udma_shift); + + ionic_resid_put(&dev->inuse_qpid, bitid); +} + +static void ionic_put_dbid(struct ionic_ibdev *dev, u32 dbid) +{ + ionic_resid_put(&dev->inuse_dbid, dbid); +} + +static struct rdma_user_mmap_entry* +ionic_mmap_entry_insert(struct ionic_ctx *ctx, unsigned long size, + unsigned long pfn, u8 mmap_flags, u64 *offset) +{ + struct ionic_mmap_entry *entry; + int rc; + + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return NULL; + + entry->size = size; + entry->pfn = pfn; + entry->mmap_flags = mmap_flags; + + rc = rdma_user_mmap_entry_insert(&ctx->ibctx, &entry->rdma_entry, + entry->size); + if (rc) { + kfree(entry); + return NULL; + } + + if (offset) + *offset = rdma_user_mmap_get_offset(&entry->rdma_entry); + + return &entry->rdma_entry; +} + +int 
ionic_alloc_ucontext(struct ib_ucontext *ibctx, struct ib_udata *udata) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibctx->device); + struct ionic_ctx *ctx = to_ionic_ctx(ibctx); + struct ionic_ctx_resp resp = {}; + struct ionic_ctx_req req; + phys_addr_t db_phys = 0; + int rc; + + rc = ib_copy_from_udata(&req, udata, sizeof(req)); + if (rc) + return rc; + + /* try to allocate dbid for user ctx */ + rc = ionic_get_dbid(dev, &ctx->dbid, &db_phys); + if (rc < 0) + return rc; + + ibdev_dbg(&dev->ibdev, "user space dbid %u\n", ctx->dbid); + + ctx->mmap_dbell = ionic_mmap_entry_insert(ctx, PAGE_SIZE, + PHYS_PFN(db_phys), 0, NULL); + if (!ctx->mmap_dbell) { + rc = -ENOMEM; + goto err_mmap_dbell; + } + + resp.page_shift = PAGE_SHIFT; + + resp.dbell_offset = db_phys & ~PAGE_MASK; + + resp.version = dev->lif_cfg.rdma_version; + resp.qp_opcodes = dev->lif_cfg.qp_opcodes; + resp.admin_opcodes = dev->lif_cfg.admin_opcodes; + + resp.sq_qtype = dev->lif_cfg.sq_qtype; + resp.rq_qtype = dev->lif_cfg.rq_qtype; + resp.cq_qtype = dev->lif_cfg.cq_qtype; + resp.admin_qtype = dev->lif_cfg.aq_qtype; + resp.max_stride = dev->lif_cfg.max_stride; + resp.max_spec = IONIC_SPEC_HIGH; + + resp.udma_count = dev->lif_cfg.udma_count; + resp.expdb_mask = dev->lif_cfg.expdb_mask; + + if (dev->lif_cfg.sq_expdb) + resp.expdb_qtypes |= IONIC_EXPDB_SQ; + if (dev->lif_cfg.rq_expdb) + resp.expdb_qtypes |= IONIC_EXPDB_RQ; + + rc = ib_copy_to_udata(udata, &resp, sizeof(resp)); + if (rc) + goto err_resp; + + return 0; + +err_resp: + rdma_user_mmap_entry_remove(ctx->mmap_dbell); +err_mmap_dbell: + ionic_put_dbid(dev, ctx->dbid); + + return rc; +} + +void ionic_dealloc_ucontext(struct ib_ucontext *ibctx) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibctx->device); + struct ionic_ctx *ctx = to_ionic_ctx(ibctx); + + rdma_user_mmap_entry_remove(ctx->mmap_dbell); + ionic_put_dbid(dev, ctx->dbid); +} + +int ionic_mmap(struct ib_ucontext *ibctx, struct vm_area_struct *vma) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibctx->device); + struct ionic_ctx *ctx = to_ionic_ctx(ibctx); + struct rdma_user_mmap_entry *rdma_entry; + struct ionic_mmap_entry *ionic_entry; + int rc = 0; + + rdma_entry = rdma_user_mmap_entry_get(&ctx->ibctx, vma); + if (!rdma_entry) { + ibdev_dbg(&dev->ibdev, "not found %#lx\n", + vma->vm_pgoff << PAGE_SHIFT); + return -EINVAL; + } + + ionic_entry = container_of(rdma_entry, struct ionic_mmap_entry, + rdma_entry); + + ibdev_dbg(&dev->ibdev, "writecombine? 
%d\n", + ionic_entry->mmap_flags & IONIC_MMAP_WC); + if (ionic_entry->mmap_flags & IONIC_MMAP_WC) + vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); + else + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + + ibdev_dbg(&dev->ibdev, "remap st %#lx pf %#lx sz %#lx\n", + vma->vm_start, ionic_entry->pfn, ionic_entry->size); + rc = rdma_user_mmap_io(&ctx->ibctx, vma, ionic_entry->pfn, + ionic_entry->size, vma->vm_page_prot, + rdma_entry); + if (rc) + ibdev_dbg(&dev->ibdev, "remap failed %d\n", rc); + + rdma_user_mmap_entry_put(rdma_entry); + return rc; +} + +void ionic_mmap_free(struct rdma_user_mmap_entry *rdma_entry) +{ + struct ionic_mmap_entry *ionic_entry; + + ionic_entry = container_of(rdma_entry, struct ionic_mmap_entry, + rdma_entry); + kfree(ionic_entry); +} + +int ionic_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibpd->device); + struct ionic_pd *pd = to_ionic_pd(ibpd); + + return ionic_get_pdid(dev, &pd->pdid); +} + +int ionic_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibpd->device); + struct ionic_pd *pd = to_ionic_pd(ibpd); + + ionic_put_pdid(dev, pd->pdid); + + return 0; +} + +static int ionic_build_hdr(struct ionic_ibdev *dev, + struct ib_ud_header *hdr, + const struct rdma_ah_attr *attr, + u16 sport, bool want_ecn) +{ + const struct ib_global_route *grh; + enum rdma_network_type net; + u16 vlan; + int rc; + + if (attr->ah_flags != IB_AH_GRH) + return -EINVAL; + if (attr->type != RDMA_AH_ATTR_TYPE_ROCE) + return -EINVAL; + + grh = rdma_ah_read_grh(attr); + + rc = rdma_read_gid_l2_fields(grh->sgid_attr, &vlan, &hdr->eth.smac_h[0]); + if (rc) + return rc; + + net = rdma_gid_attr_network_type(grh->sgid_attr); + + rc = ib_ud_header_init(0, /* no payload */ + 0, /* no lrh */ + 1, /* yes eth */ + vlan != 0xffff, + 0, /* no grh */ + net == RDMA_NETWORK_IPV4 ? 
4 : 6, + 1, /* yes udp */ + 0, /* no imm */ + hdr); + if (rc) + return rc; + + ether_addr_copy(hdr->eth.dmac_h, attr->roce.dmac); + + if (net == RDMA_NETWORK_IPV4) { + hdr->eth.type = cpu_to_be16(ETH_P_IP); + hdr->ip4.frag_off = cpu_to_be16(0x4000); /* don't fragment */ + hdr->ip4.ttl = grh->hop_limit; + hdr->ip4.tot_len = cpu_to_be16(0xffff); + hdr->ip4.saddr = + *(const __be32 *)(grh->sgid_attr->gid.raw + 12); + hdr->ip4.daddr = *(const __be32 *)(grh->dgid.raw + 12); + + if (want_ecn) + hdr->ip4.tos = ionic_set_ecn(grh->traffic_class); + else + hdr->ip4.tos = ionic_clear_ecn(grh->traffic_class); + } else { + hdr->eth.type = cpu_to_be16(ETH_P_IPV6); + hdr->grh.flow_label = cpu_to_be32(grh->flow_label); + hdr->grh.hop_limit = grh->hop_limit; + hdr->grh.source_gid = grh->sgid_attr->gid; + hdr->grh.destination_gid = grh->dgid; + + if (want_ecn) + hdr->grh.traffic_class = + ionic_set_ecn(grh->traffic_class); + else + hdr->grh.traffic_class = + ionic_clear_ecn(grh->traffic_class); + } + + if (vlan != 0xffff) { + vlan |= rdma_ah_get_sl(attr) << VLAN_PRIO_SHIFT; + hdr->vlan.tag = cpu_to_be16(vlan); + hdr->vlan.type = hdr->eth.type; + hdr->eth.type = cpu_to_be16(ETH_P_8021Q); + } + + hdr->udp.sport = cpu_to_be16(sport); + hdr->udp.dport = cpu_to_be16(ROCE_V2_UDP_DPORT); + + return 0; +} + +static void ionic_set_ah_attr(struct ionic_ibdev *dev, + struct rdma_ah_attr *ah_attr, + struct ib_ud_header *hdr, + int sgid_index) +{ + u32 flow_label; + u16 vlan = 0; + u8 tos, ttl; + + if (hdr->vlan_present) + vlan = be16_to_cpu(hdr->vlan.tag); + + if (hdr->ipv4_present) { + flow_label = 0; + ttl = hdr->ip4.ttl; + tos = hdr->ip4.tos; + *(__be16 *)(hdr->grh.destination_gid.raw + 10) = cpu_to_be16(0xffff); + *(__be32 *)(hdr->grh.destination_gid.raw + 12) = hdr->ip4.daddr; + } else { + flow_label = be32_to_cpu(hdr->grh.flow_label); + ttl = hdr->grh.hop_limit; + tos = hdr->grh.traffic_class; + } + + memset(ah_attr, 0, sizeof(*ah_attr)); + ah_attr->type = RDMA_AH_ATTR_TYPE_ROCE; + if (hdr->eth_present) + ether_addr_copy(ah_attr->roce.dmac, hdr->eth.dmac_h); + rdma_ah_set_sl(ah_attr, vlan >> VLAN_PRIO_SHIFT); + rdma_ah_set_port_num(ah_attr, 1); + rdma_ah_set_grh(ah_attr, NULL, flow_label, sgid_index, ttl, tos); + rdma_ah_set_dgid_raw(ah_attr, &hdr->grh.destination_gid); +} + +static int ionic_create_ah_cmd(struct ionic_ibdev *dev, + struct ionic_ah *ah, + struct ionic_pd *pd, + struct rdma_ah_attr *attr, + u32 flags) +{ + struct ionic_admin_wr wr = { + .work = COMPLETION_INITIALIZER_ONSTACK(wr.work), + .wqe = { + .op = IONIC_V1_ADMIN_CREATE_AH, + .len = cpu_to_le16(IONIC_ADMIN_CREATE_AH_IN_V1_LEN), + .cmd.create_ah = { + .pd_id = cpu_to_le32(pd->pdid), + .dbid_flags = cpu_to_le16(dev->lif_cfg.dbid), + .id_ver = cpu_to_le32(ah->ahid), + } + } + }; + enum ionic_admin_flags admin_flags = 0; + dma_addr_t hdr_dma = 0; + void *hdr_buf; + gfp_t gfp = GFP_ATOMIC; + int rc, hdr_len = 0; + + if (dev->lif_cfg.admin_opcodes <= IONIC_V1_ADMIN_CREATE_AH) + return -EBADRQC; + + if (flags & RDMA_CREATE_AH_SLEEPABLE) + gfp = GFP_KERNEL; + else + admin_flags |= IONIC_ADMIN_F_BUSYWAIT; + + rc = ionic_build_hdr(dev, &ah->hdr, attr, IONIC_ROCE_UDP_SPORT, false); + if (rc) + return rc; + + if (ah->hdr.eth.type == cpu_to_be16(ETH_P_8021Q)) { + if (ah->hdr.vlan.type == cpu_to_be16(ETH_P_IP)) + wr.wqe.cmd.create_ah.csum_profile = + IONIC_TFP_CSUM_PROF_ETH_QTAG_IPV4_UDP; + else + wr.wqe.cmd.create_ah.csum_profile = + IONIC_TFP_CSUM_PROF_ETH_QTAG_IPV6_UDP; + } else { + if (ah->hdr.eth.type == cpu_to_be16(ETH_P_IP)) + 
wr.wqe.cmd.create_ah.csum_profile = + IONIC_TFP_CSUM_PROF_ETH_IPV4_UDP; + else + wr.wqe.cmd.create_ah.csum_profile = + IONIC_TFP_CSUM_PROF_ETH_IPV6_UDP; + } + + ah->sgid_index = rdma_ah_read_grh(attr)->sgid_index; + + hdr_buf = kmalloc(PAGE_SIZE, gfp); + if (!hdr_buf) + return -ENOMEM; + + hdr_len = ib_ud_header_pack(&ah->hdr, hdr_buf); + hdr_len -= IB_BTH_BYTES; + hdr_len -= IB_DETH_BYTES; + ibdev_dbg(&dev->ibdev, "roce packet header template\n"); + print_hex_dump_debug("hdr ", DUMP_PREFIX_OFFSET, 16, 1, + hdr_buf, hdr_len, true); + + hdr_dma = dma_map_single(dev->lif_cfg.hwdev, hdr_buf, hdr_len, + DMA_TO_DEVICE); + + rc = dma_mapping_error(dev->lif_cfg.hwdev, hdr_dma); + if (rc) + goto err_dma; + + wr.wqe.cmd.create_ah.dma_addr = cpu_to_le64(hdr_dma); + wr.wqe.cmd.create_ah.length = cpu_to_le32(hdr_len); + + ionic_admin_post(dev, &wr); + rc = ionic_admin_wait(dev, &wr, admin_flags); + + dma_unmap_single(dev->lif_cfg.hwdev, hdr_dma, hdr_len, + DMA_TO_DEVICE); +err_dma: + kfree(hdr_buf); + + return rc; +} + +static int ionic_destroy_ah_cmd(struct ionic_ibdev *dev, u32 ahid, u32 flags) +{ + struct ionic_admin_wr wr = { + .work = COMPLETION_INITIALIZER_ONSTACK(wr.work), + .wqe = { + .op = IONIC_V1_ADMIN_DESTROY_AH, + .len = cpu_to_le16(IONIC_ADMIN_DESTROY_AH_IN_V1_LEN), + .cmd.destroy_ah = { + .ah_id = cpu_to_le32(ahid), + }, + } + }; + enum ionic_admin_flags admin_flags = IONIC_ADMIN_F_TEARDOWN; + + if (dev->lif_cfg.admin_opcodes <= IONIC_V1_ADMIN_DESTROY_AH) + return -EBADRQC; + + if (!(flags & RDMA_CREATE_AH_SLEEPABLE)) + admin_flags |= IONIC_ADMIN_F_BUSYWAIT; + + ionic_admin_post(dev, &wr); + ionic_admin_wait(dev, &wr, admin_flags); + + /* No host-memory resource is associated with ah, so it is ok + * to "succeed" and complete this destroy ah on the host. 
+ */ + return 0; +} + +int ionic_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr, + struct ib_udata *udata) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibah->device); + struct rdma_ah_attr *attr = init_attr->ah_attr; + struct ionic_pd *pd = to_ionic_pd(ibah->pd); + struct ionic_ah *ah = to_ionic_ah(ibah); + struct ionic_ah_resp resp = {}; + u32 flags = init_attr->flags; + int rc; + + rc = ionic_get_ahid(dev, &ah->ahid); + if (rc) + return rc; + + rc = ionic_create_ah_cmd(dev, ah, pd, attr, flags); + if (rc) + goto err_cmd; + + if (udata) { + resp.ahid = ah->ahid; + + rc = ib_copy_to_udata(udata, &resp, sizeof(resp)); + if (rc) + goto err_resp; + } + + return 0; + +err_resp: + ionic_destroy_ah_cmd(dev, ah->ahid, flags); +err_cmd: + ionic_put_ahid(dev, ah->ahid); + return rc; +} + +int ionic_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibah->device); + struct ionic_ah *ah = to_ionic_ah(ibah); + + ionic_set_ah_attr(dev, ah_attr, &ah->hdr, ah->sgid_index); + + return 0; +} + +int ionic_destroy_ah(struct ib_ah *ibah, u32 flags) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibah->device); + struct ionic_ah *ah = to_ionic_ah(ibah); + int rc; + + rc = ionic_destroy_ah_cmd(dev, ah->ahid, flags); + if (rc) + return rc; + + ionic_put_ahid(dev, ah->ahid); + + return 0; +} + +static int ionic_create_mr_cmd(struct ionic_ibdev *dev, + struct ionic_pd *pd, + struct ionic_mr *mr, + u64 addr, + u64 length) +{ + struct ionic_admin_wr wr = { + .work = COMPLETION_INITIALIZER_ONSTACK(wr.work), + .wqe = { + .op = IONIC_V1_ADMIN_CREATE_MR, + .len = cpu_to_le16(IONIC_ADMIN_CREATE_MR_IN_V1_LEN), + .cmd.create_mr = { + .va = cpu_to_le64(addr), + .length = cpu_to_le64(length), + .pd_id = cpu_to_le32(pd->pdid), + .page_size_log2 = mr->buf.page_size_log2, + .tbl_index = cpu_to_le32(~0), + .map_count = cpu_to_le32(mr->buf.tbl_pages), + .dma_addr = ionic_pgtbl_dma(&mr->buf, addr), + .dbid_flags = cpu_to_le16(mr->flags), + .id_ver = cpu_to_le32(mr->mrid), + } + } + }; + int rc; + + if (dev->lif_cfg.admin_opcodes <= IONIC_V1_ADMIN_CREATE_MR) + return -EBADRQC; + + ionic_admin_post(dev, &wr); + rc = ionic_admin_wait(dev, &wr, 0); + if (!rc) + mr->created = true; + + return rc; +} + +static int ionic_destroy_mr_cmd(struct ionic_ibdev *dev, u32 mrid) +{ + struct ionic_admin_wr wr = { + .work = COMPLETION_INITIALIZER_ONSTACK(wr.work), + .wqe = { + .op = IONIC_V1_ADMIN_DESTROY_MR, + .len = cpu_to_le16(IONIC_ADMIN_DESTROY_MR_IN_V1_LEN), + .cmd.destroy_mr = { + .mr_id = cpu_to_le32(mrid), + }, + } + }; + + if (dev->lif_cfg.admin_opcodes <= IONIC_V1_ADMIN_DESTROY_MR) + return -EBADRQC; + + ionic_admin_post(dev, &wr); + + return ionic_admin_wait(dev, &wr, IONIC_ADMIN_F_TEARDOWN); +} + +struct ib_mr *ionic_get_dma_mr(struct ib_pd *ibpd, int access) +{ + struct ionic_pd *pd = to_ionic_pd(ibpd); + struct ionic_mr *mr; + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + mr->ibmr.lkey = IONIC_DMA_LKEY; + mr->ibmr.rkey = IONIC_DMA_RKEY; + + if (pd) + pd->flags |= IONIC_QPF_PRIVILEGED; + + return &mr->ibmr; +} + +struct ib_mr *ionic_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 length, + u64 addr, int access, struct ib_dmah *dmah, + struct ib_udata *udata) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibpd->device); + struct ionic_pd *pd = to_ionic_pd(ibpd); + struct ionic_mr *mr; + unsigned long pg_sz; + int rc; + + if (dmah) + return ERR_PTR(-EOPNOTSUPP); + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return 
ERR_PTR(-ENOMEM); + + rc = ionic_get_mrid(dev, &mr->mrid); + if (rc) + goto err_mrid; + + mr->ibmr.lkey = mr->mrid; + mr->ibmr.rkey = mr->mrid; + mr->ibmr.iova = addr; + mr->ibmr.length = length; + + mr->flags = IONIC_MRF_USER_MR | to_ionic_mr_flags(access); + + mr->umem = ib_umem_get(&dev->ibdev, start, length, access); + if (IS_ERR(mr->umem)) { + rc = PTR_ERR(mr->umem); + goto err_umem; + } + + pg_sz = ib_umem_find_best_pgsz(mr->umem, + dev->lif_cfg.page_size_supported, + addr); + if (!pg_sz) { + rc = -EINVAL; + goto err_pgtbl; + } + + rc = ionic_pgtbl_init(dev, &mr->buf, mr->umem, 0, 1, pg_sz); + if (rc) + goto err_pgtbl; + + rc = ionic_create_mr_cmd(dev, pd, mr, addr, length); + if (rc) + goto err_cmd; + + ionic_pgtbl_unbuf(dev, &mr->buf); + + return &mr->ibmr; + +err_cmd: + ionic_pgtbl_unbuf(dev, &mr->buf); +err_pgtbl: + ib_umem_release(mr->umem); +err_umem: + ionic_put_mrid(dev, mr->mrid); +err_mrid: + kfree(mr); + return ERR_PTR(rc); +} + +struct ib_mr *ionic_reg_user_mr_dmabuf(struct ib_pd *ibpd, u64 offset, + u64 length, u64 addr, int fd, int access, + struct ib_dmah *dmah, + struct uverbs_attr_bundle *attrs) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibpd->device); + struct ionic_pd *pd = to_ionic_pd(ibpd); + struct ib_umem_dmabuf *umem_dmabuf; + struct ionic_mr *mr; + u64 pg_sz; + int rc; + + if (dmah) + return ERR_PTR(-EOPNOTSUPP); + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + rc = ionic_get_mrid(dev, &mr->mrid); + if (rc) + goto err_mrid; + + mr->ibmr.lkey = mr->mrid; + mr->ibmr.rkey = mr->mrid; + mr->ibmr.iova = addr; + mr->ibmr.length = length; + + mr->flags = IONIC_MRF_USER_MR | to_ionic_mr_flags(access); + + umem_dmabuf = ib_umem_dmabuf_get_pinned(&dev->ibdev, offset, length, + fd, access); + if (IS_ERR(umem_dmabuf)) { + rc = PTR_ERR(umem_dmabuf); + goto err_umem; + } + + mr->umem = &umem_dmabuf->umem; + + pg_sz = ib_umem_find_best_pgsz(mr->umem, + dev->lif_cfg.page_size_supported, + addr); + if (!pg_sz) { + rc = -EINVAL; + goto err_pgtbl; + } + + rc = ionic_pgtbl_init(dev, &mr->buf, mr->umem, 0, 1, pg_sz); + if (rc) + goto err_pgtbl; + + rc = ionic_create_mr_cmd(dev, pd, mr, addr, length); + if (rc) + goto err_cmd; + + ionic_pgtbl_unbuf(dev, &mr->buf); + + return &mr->ibmr; + +err_cmd: + ionic_pgtbl_unbuf(dev, &mr->buf); +err_pgtbl: + ib_umem_release(mr->umem); +err_umem: + ionic_put_mrid(dev, mr->mrid); +err_mrid: + kfree(mr); + return ERR_PTR(rc); +} + +int ionic_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibmr->device); + struct ionic_mr *mr = to_ionic_mr(ibmr); + int rc; + + if (!mr->ibmr.lkey) + goto out; + + if (mr->created) { + rc = ionic_destroy_mr_cmd(dev, mr->mrid); + if (rc) + return rc; + } + + ionic_pgtbl_unbuf(dev, &mr->buf); + + if (mr->umem) + ib_umem_release(mr->umem); + + ionic_put_mrid(dev, mr->mrid); + +out: + kfree(mr); + + return 0; +} + +struct ib_mr *ionic_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type type, + u32 max_sg) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibpd->device); + struct ionic_pd *pd = to_ionic_pd(ibpd); + struct ionic_mr *mr; + int rc; + + if (type != IB_MR_TYPE_MEM_REG) + return ERR_PTR(-EINVAL); + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + rc = ionic_get_mrid(dev, &mr->mrid); + if (rc) + goto err_mrid; + + mr->ibmr.lkey = mr->mrid; + mr->ibmr.rkey = mr->mrid; + + mr->flags = IONIC_MRF_PHYS_MR; + + rc = ionic_pgtbl_init(dev, &mr->buf, mr->umem, 0, max_sg, PAGE_SIZE); + if (rc) + goto err_pgtbl; 
+ + mr->buf.tbl_pages = 0; + + rc = ionic_create_mr_cmd(dev, pd, mr, 0, 0); + if (rc) + goto err_cmd; + + return &mr->ibmr; + +err_cmd: + ionic_pgtbl_unbuf(dev, &mr->buf); +err_pgtbl: + ionic_put_mrid(dev, mr->mrid); +err_mrid: + kfree(mr); + return ERR_PTR(rc); +} + +static int ionic_map_mr_page(struct ib_mr *ibmr, u64 dma) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibmr->device); + struct ionic_mr *mr = to_ionic_mr(ibmr); + + ibdev_dbg(&dev->ibdev, "dma %p\n", (void *)dma); + return ionic_pgtbl_page(&mr->buf, dma); +} + +int ionic_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibmr->device); + struct ionic_mr *mr = to_ionic_mr(ibmr); + int rc; + + /* mr must be allocated using ib_alloc_mr() */ + if (unlikely(!mr->buf.tbl_limit)) + return -EINVAL; + + mr->buf.tbl_pages = 0; + + if (mr->buf.tbl_buf) + dma_sync_single_for_cpu(dev->lif_cfg.hwdev, mr->buf.tbl_dma, + mr->buf.tbl_size, DMA_TO_DEVICE); + + ibdev_dbg(&dev->ibdev, "sg %p nent %d\n", sg, sg_nents); + rc = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, ionic_map_mr_page); + + mr->buf.page_size_log2 = order_base_2(ibmr->page_size); + + if (mr->buf.tbl_buf) + dma_sync_single_for_device(dev->lif_cfg.hwdev, mr->buf.tbl_dma, + mr->buf.tbl_size, DMA_TO_DEVICE); + + return rc; +} + +int ionic_alloc_mw(struct ib_mw *ibmw, struct ib_udata *udata) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibmw->device); + struct ionic_pd *pd = to_ionic_pd(ibmw->pd); + struct ionic_mr *mr = to_ionic_mw(ibmw); + int rc; + + rc = ionic_get_mrid(dev, &mr->mrid); + if (rc) + return rc; + + mr->ibmw.rkey = mr->mrid; + + if (mr->ibmw.type == IB_MW_TYPE_1) + mr->flags = IONIC_MRF_MW_1; + else + mr->flags = IONIC_MRF_MW_2; + + rc = ionic_create_mr_cmd(dev, pd, mr, 0, 0); + if (rc) + goto err_cmd; + + return 0; + +err_cmd: + ionic_put_mrid(dev, mr->mrid); + return rc; +} + +int ionic_dealloc_mw(struct ib_mw *ibmw) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibmw->device); + struct ionic_mr *mr = to_ionic_mw(ibmw); + int rc; + + rc = ionic_destroy_mr_cmd(dev, mr->mrid); + if (rc) + return rc; + + ionic_put_mrid(dev, mr->mrid); + + return 0; +} + +static int ionic_create_cq_cmd(struct ionic_ibdev *dev, + struct ionic_ctx *ctx, + struct ionic_cq *cq, + struct ionic_tbl_buf *buf) +{ + const u16 dbid = ionic_ctx_dbid(dev, ctx); + struct ionic_admin_wr wr = { + .work = COMPLETION_INITIALIZER_ONSTACK(wr.work), + .wqe = { + .op = IONIC_V1_ADMIN_CREATE_CQ, + .len = cpu_to_le16(IONIC_ADMIN_CREATE_CQ_IN_V1_LEN), + .cmd.create_cq = { + .eq_id = cpu_to_le32(cq->eqid), + .depth_log2 = cq->q.depth_log2, + .stride_log2 = cq->q.stride_log2, + .page_size_log2 = buf->page_size_log2, + .tbl_index = cpu_to_le32(~0), + .map_count = cpu_to_le32(buf->tbl_pages), + .dma_addr = ionic_pgtbl_dma(buf, 0), + .dbid_flags = cpu_to_le16(dbid), + .id_ver = cpu_to_le32(cq->cqid), + } + } + }; + + if (dev->lif_cfg.admin_opcodes <= IONIC_V1_ADMIN_CREATE_CQ) + return -EBADRQC; + + ionic_admin_post(dev, &wr); + + return ionic_admin_wait(dev, &wr, 0); +} + +static int ionic_destroy_cq_cmd(struct ionic_ibdev *dev, u32 cqid) +{ + struct ionic_admin_wr wr = { + .work = COMPLETION_INITIALIZER_ONSTACK(wr.work), + .wqe = { + .op = IONIC_V1_ADMIN_DESTROY_CQ, + .len = cpu_to_le16(IONIC_ADMIN_DESTROY_CQ_IN_V1_LEN), + .cmd.destroy_cq = { + .cq_id = cpu_to_le32(cqid), + }, + } + }; + + if (dev->lif_cfg.admin_opcodes <= IONIC_V1_ADMIN_DESTROY_CQ) + return -EBADRQC; + + ionic_admin_post(dev, &wr); + + return 
ionic_admin_wait(dev, &wr, IONIC_ADMIN_F_TEARDOWN); +} + +int ionic_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct uverbs_attr_bundle *attrs) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibcq->device); + struct ib_udata *udata = &attrs->driver_udata; + struct ionic_ctx *ctx = + rdma_udata_to_drv_context(udata, struct ionic_ctx, ibctx); + struct ionic_vcq *vcq = to_ionic_vcq(ibcq); + struct ionic_tbl_buf buf = {}; + struct ionic_cq_resp resp; + struct ionic_cq_req req; + int udma_idx = 0, rc; + + if (udata) { + rc = ib_copy_from_udata(&req, udata, sizeof(req)); + if (rc) + return rc; + } + + vcq->udma_mask = BIT(dev->lif_cfg.udma_count) - 1; + + if (udata) + vcq->udma_mask &= req.udma_mask; + + if (!vcq->udma_mask) { + rc = -EINVAL; + goto err_init; + } + + for (; udma_idx < dev->lif_cfg.udma_count; ++udma_idx) { + if (!(vcq->udma_mask & BIT(udma_idx))) + continue; + + rc = ionic_create_cq_common(vcq, &buf, attr, ctx, udata, + &req.cq[udma_idx], + &resp.cqid[udma_idx], + udma_idx); + if (rc) + goto err_init; + + rc = ionic_create_cq_cmd(dev, ctx, &vcq->cq[udma_idx], &buf); + if (rc) + goto err_cmd; + + ionic_pgtbl_unbuf(dev, &buf); + } + + vcq->ibcq.cqe = attr->cqe; + + if (udata) { + resp.udma_mask = vcq->udma_mask; + + rc = ib_copy_to_udata(udata, &resp, sizeof(resp)); + if (rc) + goto err_resp; + } + + return 0; + +err_resp: + while (udma_idx) { + --udma_idx; + if (!(vcq->udma_mask & BIT(udma_idx))) + continue; + ionic_destroy_cq_cmd(dev, vcq->cq[udma_idx].cqid); +err_cmd: + ionic_pgtbl_unbuf(dev, &buf); + ionic_destroy_cq_common(dev, &vcq->cq[udma_idx]); +err_init: + ; + } + + return rc; +} + +int ionic_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibcq->device); + struct ionic_vcq *vcq = to_ionic_vcq(ibcq); + int udma_idx, rc_tmp, rc = 0; + + for (udma_idx = dev->lif_cfg.udma_count; udma_idx; ) { + --udma_idx; + + if (!(vcq->udma_mask & BIT(udma_idx))) + continue; + + rc_tmp = ionic_destroy_cq_cmd(dev, vcq->cq[udma_idx].cqid); + if (rc_tmp) { + if (!rc) + rc = rc_tmp; + + continue; + } + + ionic_destroy_cq_common(dev, &vcq->cq[udma_idx]); + } + + return rc; +} + +static bool pd_remote_privileged(struct ib_pd *pd) +{ + return pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY; +} + +static int ionic_create_qp_cmd(struct ionic_ibdev *dev, + struct ionic_pd *pd, + struct ionic_cq *send_cq, + struct ionic_cq *recv_cq, + struct ionic_qp *qp, + struct ionic_tbl_buf *sq_buf, + struct ionic_tbl_buf *rq_buf, + struct ib_qp_init_attr *attr) +{ + const u16 dbid = ionic_obj_dbid(dev, pd->ibpd.uobject); + const u32 flags = to_ionic_qp_flags(0, 0, + qp->sq_cmb & IONIC_CMB_ENABLE, + qp->rq_cmb & IONIC_CMB_ENABLE, + qp->sq_spec, qp->rq_spec, + pd->flags & IONIC_QPF_PRIVILEGED, + pd_remote_privileged(&pd->ibpd)); + struct ionic_admin_wr wr = { + .work = COMPLETION_INITIALIZER_ONSTACK(wr.work), + .wqe = { + .op = IONIC_V1_ADMIN_CREATE_QP, + .len = cpu_to_le16(IONIC_ADMIN_CREATE_QP_IN_V1_LEN), + .cmd.create_qp = { + .pd_id = cpu_to_le32(pd->pdid), + .priv_flags = cpu_to_be32(flags), + .type_state = to_ionic_qp_type(attr->qp_type), + .dbid_flags = cpu_to_le16(dbid), + .id_ver = cpu_to_le32(qp->qpid), + } + } + }; + + if (dev->lif_cfg.admin_opcodes <= IONIC_V1_ADMIN_CREATE_QP) + return -EBADRQC; + + if (qp->has_sq) { + wr.wqe.cmd.create_qp.sq_cq_id = cpu_to_le32(send_cq->cqid); + wr.wqe.cmd.create_qp.sq_depth_log2 = qp->sq.depth_log2; + wr.wqe.cmd.create_qp.sq_stride_log2 = qp->sq.stride_log2; + wr.wqe.cmd.create_qp.sq_page_size_log2 = 
sq_buf->page_size_log2; + wr.wqe.cmd.create_qp.sq_tbl_index_xrcd_id = cpu_to_le32(~0); + wr.wqe.cmd.create_qp.sq_map_count = + cpu_to_le32(sq_buf->tbl_pages); + wr.wqe.cmd.create_qp.sq_dma_addr = ionic_pgtbl_dma(sq_buf, 0); + } + + if (qp->has_rq) { + wr.wqe.cmd.create_qp.rq_cq_id = cpu_to_le32(recv_cq->cqid); + wr.wqe.cmd.create_qp.rq_depth_log2 = qp->rq.depth_log2; + wr.wqe.cmd.create_qp.rq_stride_log2 = qp->rq.stride_log2; + wr.wqe.cmd.create_qp.rq_page_size_log2 = rq_buf->page_size_log2; + wr.wqe.cmd.create_qp.rq_tbl_index_srq_id = cpu_to_le32(~0); + wr.wqe.cmd.create_qp.rq_map_count = + cpu_to_le32(rq_buf->tbl_pages); + wr.wqe.cmd.create_qp.rq_dma_addr = ionic_pgtbl_dma(rq_buf, 0); + } + + ionic_admin_post(dev, &wr); + + return ionic_admin_wait(dev, &wr, 0); +} + +static int ionic_modify_qp_cmd(struct ionic_ibdev *dev, + struct ionic_pd *pd, + struct ionic_qp *qp, + struct ib_qp_attr *attr, + int mask) +{ + const u32 flags = to_ionic_qp_flags(attr->qp_access_flags, + attr->en_sqd_async_notify, + qp->sq_cmb & IONIC_CMB_ENABLE, + qp->rq_cmb & IONIC_CMB_ENABLE, + qp->sq_spec, qp->rq_spec, + pd->flags & IONIC_QPF_PRIVILEGED, + pd_remote_privileged(qp->ibqp.pd)); + const u8 state = to_ionic_qp_modify_state(attr->qp_state, + attr->cur_qp_state); + struct ionic_admin_wr wr = { + .work = COMPLETION_INITIALIZER_ONSTACK(wr.work), + .wqe = { + .op = IONIC_V1_ADMIN_MODIFY_QP, + .len = cpu_to_le16(IONIC_ADMIN_MODIFY_QP_IN_V1_LEN), + .cmd.mod_qp = { + .attr_mask = cpu_to_be32(mask), + .access_flags = cpu_to_be16(flags), + .rq_psn = cpu_to_le32(attr->rq_psn), + .sq_psn = cpu_to_le32(attr->sq_psn), + .rate_limit_kbps = + cpu_to_le32(attr->rate_limit), + .pmtu = (attr->path_mtu + 7), + .retry = (attr->retry_cnt | + (attr->rnr_retry << 4)), + .rnr_timer = attr->min_rnr_timer, + .retry_timeout = attr->timeout, + .type_state = state, + .id_ver = cpu_to_le32(qp->qpid), + } + } + }; + const struct ib_global_route *grh = rdma_ah_read_grh(&attr->ah_attr); + void *hdr_buf = NULL; + dma_addr_t hdr_dma = 0; + int rc, hdr_len = 0; + u16 sport; + + if (dev->lif_cfg.admin_opcodes <= IONIC_V1_ADMIN_MODIFY_QP) + return -EBADRQC; + + if ((mask & IB_QP_MAX_DEST_RD_ATOMIC) && attr->max_dest_rd_atomic) { + /* Note, round up/down was already done for allocating + * resources on the device. The allocation order is in cache + * line size. We can't use the order of the resource + * allocation to determine the order wqes here, because for + * queue length <= one cache line it is not distinct. + * + * Therefore, order wqes is computed again here. + * + * Account for hole and round up to the next order. 
+ */ + wr.wqe.cmd.mod_qp.rsq_depth = + order_base_2(attr->max_dest_rd_atomic + 1); + wr.wqe.cmd.mod_qp.rsq_index = cpu_to_le32(~0); + } + + if ((mask & IB_QP_MAX_QP_RD_ATOMIC) && attr->max_rd_atomic) { + /* Account for hole and round down to the next order */ + wr.wqe.cmd.mod_qp.rrq_depth = + order_base_2(attr->max_rd_atomic + 2) - 1; + wr.wqe.cmd.mod_qp.rrq_index = cpu_to_le32(~0); + } + + if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC) + wr.wqe.cmd.mod_qp.qkey_dest_qpn = + cpu_to_le32(attr->dest_qp_num); + else + wr.wqe.cmd.mod_qp.qkey_dest_qpn = cpu_to_le32(attr->qkey); + + if (mask & IB_QP_AV) { + if (!qp->hdr) + return -ENOMEM; + + sport = rdma_get_udp_sport(grh->flow_label, + qp->qpid, + attr->dest_qp_num); + + rc = ionic_build_hdr(dev, qp->hdr, &attr->ah_attr, sport, true); + if (rc) + return rc; + + qp->sgid_index = grh->sgid_index; + + hdr_buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!hdr_buf) + return -ENOMEM; + + hdr_len = ib_ud_header_pack(qp->hdr, hdr_buf); + hdr_len -= IB_BTH_BYTES; + hdr_len -= IB_DETH_BYTES; + ibdev_dbg(&dev->ibdev, "roce packet header template\n"); + print_hex_dump_debug("hdr ", DUMP_PREFIX_OFFSET, 16, 1, + hdr_buf, hdr_len, true); + + hdr_dma = dma_map_single(dev->lif_cfg.hwdev, hdr_buf, hdr_len, + DMA_TO_DEVICE); + + rc = dma_mapping_error(dev->lif_cfg.hwdev, hdr_dma); + if (rc) + goto err_dma; + + if (qp->hdr->ipv4_present) { + wr.wqe.cmd.mod_qp.tfp_csum_profile = + qp->hdr->vlan_present ? + IONIC_TFP_CSUM_PROF_ETH_QTAG_IPV4_UDP : + IONIC_TFP_CSUM_PROF_ETH_IPV4_UDP; + } else { + wr.wqe.cmd.mod_qp.tfp_csum_profile = + qp->hdr->vlan_present ? + IONIC_TFP_CSUM_PROF_ETH_QTAG_IPV6_UDP : + IONIC_TFP_CSUM_PROF_ETH_IPV6_UDP; + } + + wr.wqe.cmd.mod_qp.ah_id_len = + cpu_to_le32(qp->ahid | (hdr_len << 24)); + wr.wqe.cmd.mod_qp.dma_addr = cpu_to_le64(hdr_dma); + + wr.wqe.cmd.mod_qp.en_pcp = attr->ah_attr.sl; + wr.wqe.cmd.mod_qp.ip_dscp = grh->traffic_class >> 2; + } + + ionic_admin_post(dev, &wr); + + rc = ionic_admin_wait(dev, &wr, 0); + + if (mask & IB_QP_AV) + dma_unmap_single(dev->lif_cfg.hwdev, hdr_dma, hdr_len, + DMA_TO_DEVICE); +err_dma: + if (mask & IB_QP_AV) + kfree(hdr_buf); + + return rc; +} + +static int ionic_query_qp_cmd(struct ionic_ibdev *dev, + struct ionic_qp *qp, + struct ib_qp_attr *attr, + int mask) +{ + struct ionic_admin_wr wr = { + .work = COMPLETION_INITIALIZER_ONSTACK(wr.work), + .wqe = { + .op = IONIC_V1_ADMIN_QUERY_QP, + .len = cpu_to_le16(IONIC_ADMIN_QUERY_QP_IN_V1_LEN), + .cmd.query_qp = { + .id_ver = cpu_to_le32(qp->qpid), + }, + } + }; + struct ionic_v1_admin_query_qp_sq *query_sqbuf; + struct ionic_v1_admin_query_qp_rq *query_rqbuf; + dma_addr_t query_sqdma; + dma_addr_t query_rqdma; + dma_addr_t hdr_dma = 0; + void *hdr_buf = NULL; + int flags, rc; + + if (dev->lif_cfg.admin_opcodes <= IONIC_V1_ADMIN_QUERY_QP) + return -EBADRQC; + + if (qp->has_sq) { + bool expdb = !!(qp->sq_cmb & IONIC_CMB_EXPDB); + + attr->cap.max_send_sge = + ionic_v1_send_wqe_max_sge(qp->sq.stride_log2, + qp->sq_spec, + expdb); + attr->cap.max_inline_data = + ionic_v1_send_wqe_max_data(qp->sq.stride_log2, expdb); + } + + if (qp->has_rq) { + attr->cap.max_recv_sge = + ionic_v1_recv_wqe_max_sge(qp->rq.stride_log2, + qp->rq_spec, + qp->rq_cmb & IONIC_CMB_EXPDB); + } + + query_sqbuf = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!query_sqbuf) + return -ENOMEM; + + query_rqbuf = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!query_rqbuf) { + rc = -ENOMEM; + goto err_rqbuf; + } + + query_sqdma = dma_map_single(dev->lif_cfg.hwdev, query_sqbuf, PAGE_SIZE, + 
DMA_FROM_DEVICE); + rc = dma_mapping_error(dev->lif_cfg.hwdev, query_sqdma); + if (rc) + goto err_sqdma; + + query_rqdma = dma_map_single(dev->lif_cfg.hwdev, query_rqbuf, PAGE_SIZE, + DMA_FROM_DEVICE); + rc = dma_mapping_error(dev->lif_cfg.hwdev, query_rqdma); + if (rc) + goto err_rqdma; + + if (mask & IB_QP_AV) { + hdr_buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!hdr_buf) { + rc = -ENOMEM; + goto err_hdrbuf; + } + + hdr_dma = dma_map_single(dev->lif_cfg.hwdev, hdr_buf, + PAGE_SIZE, DMA_FROM_DEVICE); + rc = dma_mapping_error(dev->lif_cfg.hwdev, hdr_dma); + if (rc) + goto err_hdrdma; + } + + wr.wqe.cmd.query_qp.sq_dma_addr = cpu_to_le64(query_sqdma); + wr.wqe.cmd.query_qp.rq_dma_addr = cpu_to_le64(query_rqdma); + wr.wqe.cmd.query_qp.hdr_dma_addr = cpu_to_le64(hdr_dma); + wr.wqe.cmd.query_qp.ah_id = cpu_to_le32(qp->ahid); + + ionic_admin_post(dev, &wr); + + rc = ionic_admin_wait(dev, &wr, 0); + + if (rc) + goto err_hdrdma; + + flags = be16_to_cpu(query_sqbuf->access_perms_flags | + query_rqbuf->access_perms_flags); + + print_hex_dump_debug("sqbuf ", DUMP_PREFIX_OFFSET, 16, 1, + query_sqbuf, sizeof(*query_sqbuf), true); + print_hex_dump_debug("rqbuf ", DUMP_PREFIX_OFFSET, 16, 1, + query_rqbuf, sizeof(*query_rqbuf), true); + ibdev_dbg(&dev->ibdev, "query qp %u state_pmtu %#x flags %#x", + qp->qpid, query_rqbuf->state_pmtu, flags); + + attr->qp_state = from_ionic_qp_state(query_rqbuf->state_pmtu >> 4); + attr->cur_qp_state = attr->qp_state; + attr->path_mtu = (query_rqbuf->state_pmtu & 0xf) - 7; + attr->path_mig_state = IB_MIG_MIGRATED; + attr->qkey = be32_to_cpu(query_sqbuf->qkey_dest_qpn); + attr->rq_psn = be32_to_cpu(query_sqbuf->rq_psn); + attr->sq_psn = be32_to_cpu(query_rqbuf->sq_psn); + attr->dest_qp_num = attr->qkey; + attr->qp_access_flags = from_ionic_qp_flags(flags); + attr->pkey_index = 0; + attr->alt_pkey_index = 0; + attr->en_sqd_async_notify = !!(flags & IONIC_QPF_SQD_NOTIFY); + attr->sq_draining = !!(flags & IONIC_QPF_SQ_DRAINING); + attr->max_rd_atomic = BIT(query_rqbuf->rrq_depth) - 1; + attr->max_dest_rd_atomic = BIT(query_rqbuf->rsq_depth) - 1; + attr->min_rnr_timer = query_sqbuf->rnr_timer; + attr->port_num = 0; + attr->timeout = query_sqbuf->retry_timeout; + attr->retry_cnt = query_rqbuf->retry_rnrtry & 0xf; + attr->rnr_retry = query_rqbuf->retry_rnrtry >> 4; + attr->alt_port_num = 0; + attr->alt_timeout = 0; + attr->rate_limit = be32_to_cpu(query_sqbuf->rate_limit_kbps); + + if (mask & IB_QP_AV) + ionic_set_ah_attr(dev, &attr->ah_attr, + qp->hdr, qp->sgid_index); + +err_hdrdma: + if (mask & IB_QP_AV) { + dma_unmap_single(dev->lif_cfg.hwdev, hdr_dma, + PAGE_SIZE, DMA_FROM_DEVICE); + kfree(hdr_buf); + } +err_hdrbuf: + dma_unmap_single(dev->lif_cfg.hwdev, query_rqdma, sizeof(*query_rqbuf), + DMA_FROM_DEVICE); +err_rqdma: + dma_unmap_single(dev->lif_cfg.hwdev, query_sqdma, sizeof(*query_sqbuf), + DMA_FROM_DEVICE); +err_sqdma: + kfree(query_rqbuf); +err_rqbuf: + kfree(query_sqbuf); + + return rc; +} + +static int ionic_destroy_qp_cmd(struct ionic_ibdev *dev, u32 qpid) +{ + struct ionic_admin_wr wr = { + .work = COMPLETION_INITIALIZER_ONSTACK(wr.work), + .wqe = { + .op = IONIC_V1_ADMIN_DESTROY_QP, + .len = cpu_to_le16(IONIC_ADMIN_DESTROY_QP_IN_V1_LEN), + .cmd.destroy_qp = { + .qp_id = cpu_to_le32(qpid), + }, + } + }; + + if (dev->lif_cfg.admin_opcodes <= IONIC_V1_ADMIN_DESTROY_QP) + return -EBADRQC; + + ionic_admin_post(dev, &wr); + + return ionic_admin_wait(dev, &wr, IONIC_ADMIN_F_TEARDOWN); +} + +static bool ionic_expdb_wqe_size_supported(struct ionic_ibdev *dev, + uint32_t 
wqe_size) +{ + switch (wqe_size) { + case 64: return dev->lif_cfg.expdb_mask & IONIC_EXPDB_64; + case 128: return dev->lif_cfg.expdb_mask & IONIC_EXPDB_128; + case 256: return dev->lif_cfg.expdb_mask & IONIC_EXPDB_256; + case 512: return dev->lif_cfg.expdb_mask & IONIC_EXPDB_512; + } + + return false; +} + +static void ionic_qp_sq_init_cmb(struct ionic_ibdev *dev, + struct ionic_qp *qp, + struct ib_udata *udata, + int max_data) +{ + u8 expdb_stride_log2 = 0; + bool expdb; + int rc; + + if (!(qp->sq_cmb & IONIC_CMB_ENABLE)) + goto not_in_cmb; + + if (qp->sq_cmb & ~IONIC_CMB_SUPPORTED) { + if (qp->sq_cmb & IONIC_CMB_REQUIRE) + goto not_in_cmb; + + qp->sq_cmb &= IONIC_CMB_SUPPORTED; + } + + if ((qp->sq_cmb & IONIC_CMB_EXPDB) && !dev->lif_cfg.sq_expdb) { + if (qp->sq_cmb & IONIC_CMB_REQUIRE) + goto not_in_cmb; + + qp->sq_cmb &= ~IONIC_CMB_EXPDB; + } + + qp->sq_cmb_order = order_base_2(qp->sq.size / PAGE_SIZE); + + if (qp->sq_cmb_order >= IONIC_SQCMB_ORDER) + goto not_in_cmb; + + if (qp->sq_cmb & IONIC_CMB_EXPDB) + expdb_stride_log2 = qp->sq.stride_log2; + + rc = ionic_get_cmb(dev->lif_cfg.lif, &qp->sq_cmb_pgid, + &qp->sq_cmb_addr, qp->sq_cmb_order, + expdb_stride_log2, &expdb); + if (rc) + goto not_in_cmb; + + if ((qp->sq_cmb & IONIC_CMB_EXPDB) && !expdb) { + if (qp->sq_cmb & IONIC_CMB_REQUIRE) + goto err_map; + + qp->sq_cmb &= ~IONIC_CMB_EXPDB; + } + + return; + +err_map: + ionic_put_cmb(dev->lif_cfg.lif, qp->sq_cmb_pgid, qp->sq_cmb_order); +not_in_cmb: + if (qp->sq_cmb & IONIC_CMB_REQUIRE) + ibdev_dbg(&dev->ibdev, "could not place sq in cmb as required\n"); + + qp->sq_cmb = 0; + qp->sq_cmb_order = IONIC_RES_INVALID; + qp->sq_cmb_pgid = 0; + qp->sq_cmb_addr = 0; +} + +static void ionic_qp_sq_destroy_cmb(struct ionic_ibdev *dev, + struct ionic_ctx *ctx, + struct ionic_qp *qp) +{ + if (!(qp->sq_cmb & IONIC_CMB_ENABLE)) + return; + + if (ctx) + rdma_user_mmap_entry_remove(qp->mmap_sq_cmb); + + ionic_put_cmb(dev->lif_cfg.lif, qp->sq_cmb_pgid, qp->sq_cmb_order); +} + +static int ionic_qp_sq_init(struct ionic_ibdev *dev, struct ionic_ctx *ctx, + struct ionic_qp *qp, struct ionic_qdesc *sq, + struct ionic_tbl_buf *buf, int max_wr, int max_sge, + int max_data, int sq_spec, struct ib_udata *udata) +{ + u32 wqe_size; + int rc = 0; + + qp->sq_msn_prod = 0; + qp->sq_msn_cons = 0; + + if (!qp->has_sq) { + if (buf) { + buf->tbl_buf = NULL; + buf->tbl_limit = 0; + buf->tbl_pages = 0; + } + if (udata) + rc = ionic_validate_qdesc_zero(sq); + + return rc; + } + + rc = -EINVAL; + + if (max_wr < 0 || max_wr > 0xffff) + return rc; + + if (max_sge < 1) + return rc; + + if (max_sge > min(ionic_v1_send_wqe_max_sge(dev->lif_cfg.max_stride, 0, + qp->sq_cmb & + IONIC_CMB_EXPDB), + IONIC_SPEC_HIGH)) + return rc; + + if (max_data < 0) + return rc; + + if (max_data > ionic_v1_send_wqe_max_data(dev->lif_cfg.max_stride, + qp->sq_cmb & IONIC_CMB_EXPDB)) + return rc; + + if (udata) { + rc = ionic_validate_qdesc(sq); + if (rc) + return rc; + + qp->sq_spec = sq_spec; + + qp->sq.ptr = NULL; + qp->sq.size = sq->size; + qp->sq.mask = sq->mask; + qp->sq.depth_log2 = sq->depth_log2; + qp->sq.stride_log2 = sq->stride_log2; + + qp->sq_meta = NULL; + qp->sq_msn_idx = NULL; + + qp->sq_umem = ib_umem_get(&dev->ibdev, sq->addr, sq->size, 0); + if (IS_ERR(qp->sq_umem)) + return PTR_ERR(qp->sq_umem); + } else { + qp->sq_umem = NULL; + + qp->sq_spec = ionic_v1_use_spec_sge(max_sge, sq_spec); + if (sq_spec && !qp->sq_spec) + ibdev_dbg(&dev->ibdev, + "init sq: max_sge %u disables spec\n", + max_sge); + + if (qp->sq_cmb & IONIC_CMB_EXPDB) { + 
wqe_size = ionic_v1_send_wqe_min_size(max_sge, max_data, + qp->sq_spec, + true); + + if (!ionic_expdb_wqe_size_supported(dev, wqe_size)) + qp->sq_cmb &= ~IONIC_CMB_EXPDB; + } + + if (!(qp->sq_cmb & IONIC_CMB_EXPDB)) + wqe_size = ionic_v1_send_wqe_min_size(max_sge, max_data, + qp->sq_spec, + false); + + rc = ionic_queue_init(&qp->sq, dev->lif_cfg.hwdev, + max_wr, wqe_size); + if (rc) + return rc; + + ionic_queue_dbell_init(&qp->sq, qp->qpid); + + qp->sq_meta = kmalloc_array((u32)qp->sq.mask + 1, + sizeof(*qp->sq_meta), + GFP_KERNEL); + if (!qp->sq_meta) { + rc = -ENOMEM; + goto err_sq_meta; + } + + qp->sq_msn_idx = kmalloc_array((u32)qp->sq.mask + 1, + sizeof(*qp->sq_msn_idx), + GFP_KERNEL); + if (!qp->sq_msn_idx) { + rc = -ENOMEM; + goto err_sq_msn; + } + } + + ionic_qp_sq_init_cmb(dev, qp, udata, max_data); + + if (qp->sq_cmb & IONIC_CMB_ENABLE) + rc = ionic_pgtbl_init(dev, buf, NULL, + (u64)qp->sq_cmb_pgid << PAGE_SHIFT, + 1, PAGE_SIZE); + else + rc = ionic_pgtbl_init(dev, buf, + qp->sq_umem, qp->sq.dma, 1, PAGE_SIZE); + if (rc) + goto err_sq_tbl; + + return 0; + +err_sq_tbl: + ionic_qp_sq_destroy_cmb(dev, ctx, qp); + kfree(qp->sq_msn_idx); +err_sq_msn: + kfree(qp->sq_meta); +err_sq_meta: + if (qp->sq_umem) + ib_umem_release(qp->sq_umem); + else + ionic_queue_destroy(&qp->sq, dev->lif_cfg.hwdev); + return rc; +} + +static void ionic_qp_sq_destroy(struct ionic_ibdev *dev, + struct ionic_ctx *ctx, + struct ionic_qp *qp) +{ + if (!qp->has_sq) + return; + + ionic_qp_sq_destroy_cmb(dev, ctx, qp); + + kfree(qp->sq_msn_idx); + kfree(qp->sq_meta); + + if (qp->sq_umem) + ib_umem_release(qp->sq_umem); + else + ionic_queue_destroy(&qp->sq, dev->lif_cfg.hwdev); +} + +static void ionic_qp_rq_init_cmb(struct ionic_ibdev *dev, + struct ionic_qp *qp, + struct ib_udata *udata) +{ + u8 expdb_stride_log2 = 0; + bool expdb; + int rc; + + if (!(qp->rq_cmb & IONIC_CMB_ENABLE)) + goto not_in_cmb; + + if (qp->rq_cmb & ~IONIC_CMB_SUPPORTED) { + if (qp->rq_cmb & IONIC_CMB_REQUIRE) + goto not_in_cmb; + + qp->rq_cmb &= IONIC_CMB_SUPPORTED; + } + + if ((qp->rq_cmb & IONIC_CMB_EXPDB) && !dev->lif_cfg.rq_expdb) { + if (qp->rq_cmb & IONIC_CMB_REQUIRE) + goto not_in_cmb; + + qp->rq_cmb &= ~IONIC_CMB_EXPDB; + } + + qp->rq_cmb_order = order_base_2(qp->rq.size / PAGE_SIZE); + + if (qp->rq_cmb_order >= IONIC_RQCMB_ORDER) + goto not_in_cmb; + + if (qp->rq_cmb & IONIC_CMB_EXPDB) + expdb_stride_log2 = qp->rq.stride_log2; + + rc = ionic_get_cmb(dev->lif_cfg.lif, &qp->rq_cmb_pgid, + &qp->rq_cmb_addr, qp->rq_cmb_order, + expdb_stride_log2, &expdb); + if (rc) + goto not_in_cmb; + + if ((qp->rq_cmb & IONIC_CMB_EXPDB) && !expdb) { + if (qp->rq_cmb & IONIC_CMB_REQUIRE) + goto err_map; + + qp->rq_cmb &= ~IONIC_CMB_EXPDB; + } + + return; + +err_map: + ionic_put_cmb(dev->lif_cfg.lif, qp->rq_cmb_pgid, qp->rq_cmb_order); +not_in_cmb: + if (qp->rq_cmb & IONIC_CMB_REQUIRE) + ibdev_dbg(&dev->ibdev, "could not place rq in cmb as required\n"); + + qp->rq_cmb = 0; + qp->rq_cmb_order = IONIC_RES_INVALID; + qp->rq_cmb_pgid = 0; + qp->rq_cmb_addr = 0; +} + +static void ionic_qp_rq_destroy_cmb(struct ionic_ibdev *dev, + struct ionic_ctx *ctx, + struct ionic_qp *qp) +{ + if (!(qp->rq_cmb & IONIC_CMB_ENABLE)) + return; + + if (ctx) + rdma_user_mmap_entry_remove(qp->mmap_rq_cmb); + + ionic_put_cmb(dev->lif_cfg.lif, qp->rq_cmb_pgid, qp->rq_cmb_order); +} + +static int ionic_qp_rq_init(struct ionic_ibdev *dev, struct ionic_ctx *ctx, + struct ionic_qp *qp, struct ionic_qdesc *rq, + struct ionic_tbl_buf *buf, int max_wr, int max_sge, + int rq_spec, struct 
ib_udata *udata) +{ + int rc = 0, i; + u32 wqe_size; + + if (!qp->has_rq) { + if (buf) { + buf->tbl_buf = NULL; + buf->tbl_limit = 0; + buf->tbl_pages = 0; + } + if (udata) + rc = ionic_validate_qdesc_zero(rq); + + return rc; + } + + rc = -EINVAL; + + if (max_wr < 0 || max_wr > 0xffff) + return rc; + + if (max_sge < 1) + return rc; + + if (max_sge > min(ionic_v1_recv_wqe_max_sge(dev->lif_cfg.max_stride, 0, false), + IONIC_SPEC_HIGH)) + return rc; + + if (udata) { + rc = ionic_validate_qdesc(rq); + if (rc) + return rc; + + qp->rq_spec = rq_spec; + + qp->rq.ptr = NULL; + qp->rq.size = rq->size; + qp->rq.mask = rq->mask; + qp->rq.depth_log2 = rq->depth_log2; + qp->rq.stride_log2 = rq->stride_log2; + + qp->rq_meta = NULL; + + qp->rq_umem = ib_umem_get(&dev->ibdev, rq->addr, rq->size, 0); + if (IS_ERR(qp->rq_umem)) + return PTR_ERR(qp->rq_umem); + } else { + qp->rq_umem = NULL; + + qp->rq_spec = ionic_v1_use_spec_sge(max_sge, rq_spec); + if (rq_spec && !qp->rq_spec) + ibdev_dbg(&dev->ibdev, + "init rq: max_sge %u disables spec\n", + max_sge); + + if (qp->rq_cmb & IONIC_CMB_EXPDB) { + wqe_size = ionic_v1_recv_wqe_min_size(max_sge, + qp->rq_spec, + true); + + if (!ionic_expdb_wqe_size_supported(dev, wqe_size)) + qp->rq_cmb &= ~IONIC_CMB_EXPDB; + } + + if (!(qp->rq_cmb & IONIC_CMB_EXPDB)) + wqe_size = ionic_v1_recv_wqe_min_size(max_sge, + qp->rq_spec, + false); + + rc = ionic_queue_init(&qp->rq, dev->lif_cfg.hwdev, + max_wr, wqe_size); + if (rc) + return rc; + + ionic_queue_dbell_init(&qp->rq, qp->qpid); + + qp->rq_meta = kmalloc_array((u32)qp->rq.mask + 1, + sizeof(*qp->rq_meta), + GFP_KERNEL); + if (!qp->rq_meta) { + rc = -ENOMEM; + goto err_rq_meta; + } + + for (i = 0; i < qp->rq.mask; ++i) + qp->rq_meta[i].next = &qp->rq_meta[i + 1]; + qp->rq_meta[i].next = IONIC_META_LAST; + qp->rq_meta_head = &qp->rq_meta[0]; + } + + ionic_qp_rq_init_cmb(dev, qp, udata); + + if (qp->rq_cmb & IONIC_CMB_ENABLE) + rc = ionic_pgtbl_init(dev, buf, NULL, + (u64)qp->rq_cmb_pgid << PAGE_SHIFT, + 1, PAGE_SIZE); + else + rc = ionic_pgtbl_init(dev, buf, + qp->rq_umem, qp->rq.dma, 1, PAGE_SIZE); + if (rc) + goto err_rq_tbl; + + return 0; + +err_rq_tbl: + ionic_qp_rq_destroy_cmb(dev, ctx, qp); + kfree(qp->rq_meta); +err_rq_meta: + if (qp->rq_umem) + ib_umem_release(qp->rq_umem); + else + ionic_queue_destroy(&qp->rq, dev->lif_cfg.hwdev); + return rc; +} + +static void ionic_qp_rq_destroy(struct ionic_ibdev *dev, + struct ionic_ctx *ctx, + struct ionic_qp *qp) +{ + if (!qp->has_rq) + return; + + ionic_qp_rq_destroy_cmb(dev, ctx, qp); + + kfree(qp->rq_meta); + + if (qp->rq_umem) + ib_umem_release(qp->rq_umem); + else + ionic_queue_destroy(&qp->rq, dev->lif_cfg.hwdev); +} + +int ionic_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *attr, + struct ib_udata *udata) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibqp->device); + struct ionic_tbl_buf sq_buf = {}, rq_buf = {}; + struct ionic_pd *pd = to_ionic_pd(ibqp->pd); + struct ionic_qp *qp = to_ionic_qp(ibqp); + struct ionic_ctx *ctx = + rdma_udata_to_drv_context(udata, struct ionic_ctx, ibctx); + struct ionic_qp_resp resp = {}; + struct ionic_qp_req req = {}; + struct ionic_cq *cq; + u8 udma_mask; + void *entry; + int rc; + + if (udata) { + rc = ib_copy_from_udata(&req, udata, sizeof(req)); + if (rc) + return rc; + } else { + req.sq_spec = IONIC_SPEC_HIGH; + req.rq_spec = IONIC_SPEC_HIGH; + } + + if (attr->qp_type == IB_QPT_SMI || attr->qp_type > IB_QPT_UD) + return -EOPNOTSUPP; + + qp->state = IB_QPS_RESET; + + INIT_LIST_HEAD(&qp->cq_poll_sq); + 
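/* these list heads link the qp onto its cqs' poll and flush lists */ +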
INIT_LIST_HEAD(&qp->cq_flush_sq); + INIT_LIST_HEAD(&qp->cq_flush_rq); + + spin_lock_init(&qp->sq_lock); + spin_lock_init(&qp->rq_lock); + + qp->has_sq = 1; + qp->has_rq = 1; + + if (attr->qp_type == IB_QPT_GSI) { + rc = ionic_get_gsi_qpid(dev, &qp->qpid); + } else { + udma_mask = BIT(dev->lif_cfg.udma_count) - 1; + + if (qp->has_sq) + udma_mask &= to_ionic_vcq(attr->send_cq)->udma_mask; + + if (qp->has_rq) + udma_mask &= to_ionic_vcq(attr->recv_cq)->udma_mask; + + if (udata && req.udma_mask) + udma_mask &= req.udma_mask; + + if (!udma_mask) + return -EINVAL; + + rc = ionic_get_qpid(dev, &qp->qpid, &qp->udma_idx, udma_mask); + } + if (rc) + return rc; + + qp->sig_all = attr->sq_sig_type == IB_SIGNAL_ALL_WR; + qp->has_ah = attr->qp_type == IB_QPT_RC; + + if (qp->has_ah) { + qp->hdr = kzalloc(sizeof(*qp->hdr), GFP_KERNEL); + if (!qp->hdr) { + rc = -ENOMEM; + goto err_ah_alloc; + } + + rc = ionic_get_ahid(dev, &qp->ahid); + if (rc) + goto err_ahid; + } + + if (udata) { + if (req.rq_cmb & IONIC_CMB_ENABLE) + qp->rq_cmb = req.rq_cmb; + + if (req.sq_cmb & IONIC_CMB_ENABLE) + qp->sq_cmb = req.sq_cmb; + } + + rc = ionic_qp_sq_init(dev, ctx, qp, &req.sq, &sq_buf, + attr->cap.max_send_wr, attr->cap.max_send_sge, + attr->cap.max_inline_data, req.sq_spec, udata); + if (rc) + goto err_sq; + + rc = ionic_qp_rq_init(dev, ctx, qp, &req.rq, &rq_buf, + attr->cap.max_recv_wr, attr->cap.max_recv_sge, + req.rq_spec, udata); + if (rc) + goto err_rq; + + rc = ionic_create_qp_cmd(dev, pd, + to_ionic_vcq_cq(attr->send_cq, qp->udma_idx), + to_ionic_vcq_cq(attr->recv_cq, qp->udma_idx), + qp, &sq_buf, &rq_buf, attr); + if (rc) + goto err_cmd; + + if (udata) { + resp.qpid = qp->qpid; + resp.udma_idx = qp->udma_idx; + + if (qp->sq_cmb & IONIC_CMB_ENABLE) { + bool wc; + + if ((qp->sq_cmb & (IONIC_CMB_WC | IONIC_CMB_UC)) == + (IONIC_CMB_WC | IONIC_CMB_UC)) { + ibdev_dbg(&dev->ibdev, + "Both sq_cmb flags IONIC_CMB_WC and IONIC_CMB_UC are set, using default driver mapping\n"); + qp->sq_cmb &= ~(IONIC_CMB_WC | IONIC_CMB_UC); + } + + wc = (qp->sq_cmb & (IONIC_CMB_WC | IONIC_CMB_UC)) + != IONIC_CMB_UC; + + /* let userspace know the mapping */ + if (wc) + qp->sq_cmb |= IONIC_CMB_WC; + else + qp->sq_cmb |= IONIC_CMB_UC; + + qp->mmap_sq_cmb = + ionic_mmap_entry_insert(ctx, + qp->sq.size, + PHYS_PFN(qp->sq_cmb_addr), + wc ? IONIC_MMAP_WC : 0, + &resp.sq_cmb_offset); + if (!qp->mmap_sq_cmb) { + rc = -ENOMEM; + goto err_mmap_sq; + } + + resp.sq_cmb = qp->sq_cmb; + } + + if (qp->rq_cmb & IONIC_CMB_ENABLE) { + bool wc; + + if ((qp->rq_cmb & (IONIC_CMB_WC | IONIC_CMB_UC)) == + (IONIC_CMB_WC | IONIC_CMB_UC)) { + ibdev_dbg(&dev->ibdev, + "Both rq_cmb flags IONIC_CMB_WC and IONIC_CMB_UC are set, using default driver mapping\n"); + qp->rq_cmb &= ~(IONIC_CMB_WC | IONIC_CMB_UC); + } + + if (qp->rq_cmb & IONIC_CMB_EXPDB) + wc = (qp->rq_cmb & (IONIC_CMB_WC | IONIC_CMB_UC)) + == IONIC_CMB_WC; + else + wc = (qp->rq_cmb & (IONIC_CMB_WC | IONIC_CMB_UC)) + != IONIC_CMB_UC; + + /* let userspace know the mapping */ + if (wc) + qp->rq_cmb |= IONIC_CMB_WC; + else + qp->rq_cmb |= IONIC_CMB_UC; + + qp->mmap_rq_cmb = + ionic_mmap_entry_insert(ctx, + qp->rq.size, + PHYS_PFN(qp->rq_cmb_addr), + wc ? 
IONIC_MMAP_WC : 0, + &resp.rq_cmb_offset); + if (!qp->mmap_rq_cmb) { + rc = -ENOMEM; + goto err_mmap_rq; + } + + resp.rq_cmb = qp->rq_cmb; + } + + rc = ib_copy_to_udata(udata, &resp, sizeof(resp)); + if (rc) + goto err_resp; + } + + ionic_pgtbl_unbuf(dev, &rq_buf); + ionic_pgtbl_unbuf(dev, &sq_buf); + + qp->ibqp.qp_num = qp->qpid; + + init_completion(&qp->qp_rel_comp); + kref_init(&qp->qp_kref); + + entry = xa_store_irq(&dev->qp_tbl, qp->qpid, qp, GFP_KERNEL); + if (entry) { + if (!xa_is_err(entry)) + rc = -EINVAL; + else + rc = xa_err(entry); + + goto err_resp; + } + + if (qp->has_sq) { + cq = to_ionic_vcq_cq(attr->send_cq, qp->udma_idx); + + attr->cap.max_send_wr = qp->sq.mask; + attr->cap.max_send_sge = + ionic_v1_send_wqe_max_sge(qp->sq.stride_log2, + qp->sq_spec, + qp->sq_cmb & IONIC_CMB_EXPDB); + attr->cap.max_inline_data = + ionic_v1_send_wqe_max_data(qp->sq.stride_log2, + qp->sq_cmb & + IONIC_CMB_EXPDB); + qp->sq_cqid = cq->cqid; + } + + if (qp->has_rq) { + cq = to_ionic_vcq_cq(attr->recv_cq, qp->udma_idx); + + attr->cap.max_recv_wr = qp->rq.mask; + attr->cap.max_recv_sge = + ionic_v1_recv_wqe_max_sge(qp->rq.stride_log2, + qp->rq_spec, + qp->rq_cmb & IONIC_CMB_EXPDB); + qp->rq_cqid = cq->cqid; + } + + return 0; + +err_resp: + if (udata && (qp->rq_cmb & IONIC_CMB_ENABLE)) + rdma_user_mmap_entry_remove(qp->mmap_rq_cmb); +err_mmap_rq: + if (udata && (qp->sq_cmb & IONIC_CMB_ENABLE)) + rdma_user_mmap_entry_remove(qp->mmap_sq_cmb); +err_mmap_sq: + ionic_destroy_qp_cmd(dev, qp->qpid); +err_cmd: + ionic_pgtbl_unbuf(dev, &rq_buf); + ionic_qp_rq_destroy(dev, ctx, qp); +err_rq: + ionic_pgtbl_unbuf(dev, &sq_buf); + ionic_qp_sq_destroy(dev, ctx, qp); +err_sq: + if (qp->has_ah) + ionic_put_ahid(dev, qp->ahid); +err_ahid: + kfree(qp->hdr); +err_ah_alloc: + ionic_put_qpid(dev, qp->qpid); + return rc; +} + +void ionic_notify_flush_cq(struct ionic_cq *cq) +{ + if (cq->flush && cq->vcq->ibcq.comp_handler) + cq->vcq->ibcq.comp_handler(&cq->vcq->ibcq, + cq->vcq->ibcq.cq_context); +} + +static void ionic_notify_qp_cqs(struct ionic_ibdev *dev, struct ionic_qp *qp) +{ + if (qp->ibqp.send_cq) + ionic_notify_flush_cq(to_ionic_vcq_cq(qp->ibqp.send_cq, + qp->udma_idx)); + if (qp->ibqp.recv_cq && qp->ibqp.recv_cq != qp->ibqp.send_cq) + ionic_notify_flush_cq(to_ionic_vcq_cq(qp->ibqp.recv_cq, + qp->udma_idx)); +} + +void ionic_flush_qp(struct ionic_ibdev *dev, struct ionic_qp *qp) +{ + unsigned long irqflags; + struct ionic_cq *cq; + + if (qp->ibqp.send_cq) { + cq = to_ionic_vcq_cq(qp->ibqp.send_cq, qp->udma_idx); + + /* Hold the CQ lock and QP sq_lock to set up flush */ + spin_lock_irqsave(&cq->lock, irqflags); + spin_lock(&qp->sq_lock); + qp->sq_flush = true; + if (!ionic_queue_empty(&qp->sq)) { + cq->flush = true; + list_move_tail(&qp->cq_flush_sq, &cq->flush_sq); + } + spin_unlock(&qp->sq_lock); + spin_unlock_irqrestore(&cq->lock, irqflags); + } + + if (qp->ibqp.recv_cq) { + cq = to_ionic_vcq_cq(qp->ibqp.recv_cq, qp->udma_idx); + + /* Hold the CQ lock and QP rq_lock to set up flush */ + spin_lock_irqsave(&cq->lock, irqflags); + spin_lock(&qp->rq_lock); + qp->rq_flush = true; + if (!ionic_queue_empty(&qp->rq)) { + cq->flush = true; + list_move_tail(&qp->cq_flush_rq, &cq->flush_rq); + } + spin_unlock(&qp->rq_lock); + spin_unlock_irqrestore(&cq->lock, irqflags); + } +} + +static void ionic_clean_cq(struct ionic_cq *cq, u32 qpid) +{ + struct ionic_v1_cqe *qcqe; + int prod, qtf, qid, type; + bool color; + + if (!cq->q.ptr) + return; + + color = cq->color; + prod = cq->q.prod; + qcqe = ionic_queue_at(&cq->q, 
prod); + + while (color == ionic_v1_cqe_color(qcqe)) { + qtf = ionic_v1_cqe_qtf(qcqe); + qid = ionic_v1_cqe_qtf_qid(qtf); + type = ionic_v1_cqe_qtf_type(qtf); + + if (qid == qpid && type != IONIC_V1_CQE_TYPE_ADMIN) + ionic_v1_cqe_clean(qcqe); + + prod = ionic_queue_next(&cq->q, prod); + qcqe = ionic_queue_at(&cq->q, prod); + color = ionic_color_wrap(prod, color); + } +} + +static void ionic_reset_qp(struct ionic_ibdev *dev, struct ionic_qp *qp) +{ + unsigned long irqflags; + struct ionic_cq *cq; + int i; + + local_irq_save(irqflags); + + if (qp->ibqp.send_cq) { + cq = to_ionic_vcq_cq(qp->ibqp.send_cq, qp->udma_idx); + spin_lock(&cq->lock); + ionic_clean_cq(cq, qp->qpid); + spin_unlock(&cq->lock); + } + + if (qp->ibqp.recv_cq) { + cq = to_ionic_vcq_cq(qp->ibqp.recv_cq, qp->udma_idx); + spin_lock(&cq->lock); + ionic_clean_cq(cq, qp->qpid); + spin_unlock(&cq->lock); + } + + if (qp->has_sq) { + spin_lock(&qp->sq_lock); + qp->sq_flush = false; + qp->sq_flush_rcvd = false; + qp->sq_msn_prod = 0; + qp->sq_msn_cons = 0; + qp->sq.prod = 0; + qp->sq.cons = 0; + spin_unlock(&qp->sq_lock); + } + + if (qp->has_rq) { + spin_lock(&qp->rq_lock); + qp->rq_flush = false; + qp->rq.prod = 0; + qp->rq.cons = 0; + if (qp->rq_meta) { + for (i = 0; i < qp->rq.mask; ++i) + qp->rq_meta[i].next = &qp->rq_meta[i + 1]; + qp->rq_meta[i].next = IONIC_META_LAST; + } + qp->rq_meta_head = &qp->rq_meta[0]; + spin_unlock(&qp->rq_lock); + } + + local_irq_restore(irqflags); +} + +static bool ionic_qp_cur_state_is_ok(enum ib_qp_state q_state, + enum ib_qp_state attr_state) +{ + if (q_state == attr_state) + return true; + + if (attr_state == IB_QPS_ERR) + return true; + + if (attr_state == IB_QPS_SQE) + return q_state == IB_QPS_RTS || q_state == IB_QPS_SQD; + + return false; +} + +static int ionic_check_modify_qp(struct ionic_qp *qp, struct ib_qp_attr *attr, + int mask) +{ + enum ib_qp_state cur_state = (mask & IB_QP_CUR_STATE) ? + attr->cur_qp_state : qp->state; + enum ib_qp_state next_state = (mask & IB_QP_STATE) ? 
+ attr->qp_state : cur_state; + + if ((mask & IB_QP_CUR_STATE) && + !ionic_qp_cur_state_is_ok(qp->state, attr->cur_qp_state)) + return -EINVAL; + + if (!ib_modify_qp_is_ok(cur_state, next_state, qp->ibqp.qp_type, mask)) + return -EINVAL; + + /* unprivileged qp not allowed privileged qkey */ + if ((mask & IB_QP_QKEY) && (attr->qkey & 0x80000000) && + qp->ibqp.uobject) + return -EPERM; + + return 0; +} + +int ionic_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int mask, + struct ib_udata *udata) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibqp->device); + struct ionic_pd *pd = to_ionic_pd(ibqp->pd); + struct ionic_qp *qp = to_ionic_qp(ibqp); + int rc; + + rc = ionic_check_modify_qp(qp, attr, mask); + if (rc) + return rc; + + if (mask & IB_QP_CAP) + return -EINVAL; + + rc = ionic_modify_qp_cmd(dev, pd, qp, attr, mask); + if (rc) + return rc; + + if (mask & IB_QP_STATE) { + qp->state = attr->qp_state; + + if (attr->qp_state == IB_QPS_ERR) { + ionic_flush_qp(dev, qp); + ionic_notify_qp_cqs(dev, qp); + } else if (attr->qp_state == IB_QPS_RESET) { + ionic_reset_qp(dev, qp); + } + } + + return 0; +} + +int ionic_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int mask, struct ib_qp_init_attr *init_attr) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibqp->device); + struct ionic_qp *qp = to_ionic_qp(ibqp); + int rc; + + memset(attr, 0, sizeof(*attr)); + memset(init_attr, 0, sizeof(*init_attr)); + + rc = ionic_query_qp_cmd(dev, qp, attr, mask); + if (rc) + return rc; + + if (qp->has_sq) + attr->cap.max_send_wr = qp->sq.mask; + + if (qp->has_rq) + attr->cap.max_recv_wr = qp->rq.mask; + + init_attr->event_handler = ibqp->event_handler; + init_attr->qp_context = ibqp->qp_context; + init_attr->send_cq = ibqp->send_cq; + init_attr->recv_cq = ibqp->recv_cq; + init_attr->srq = ibqp->srq; + init_attr->xrcd = ibqp->xrcd; + init_attr->cap = attr->cap; + init_attr->sq_sig_type = qp->sig_all ? 
+ IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR; + init_attr->qp_type = ibqp->qp_type; + init_attr->create_flags = 0; + init_attr->port_num = 0; + init_attr->rwq_ind_tbl = ibqp->rwq_ind_tbl; + init_attr->source_qpn = 0; + + return rc; +} + +int ionic_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) +{ + struct ionic_ctx *ctx = + rdma_udata_to_drv_context(udata, struct ionic_ctx, ibctx); + struct ionic_ibdev *dev = to_ionic_ibdev(ibqp->device); + struct ionic_qp *qp = to_ionic_qp(ibqp); + unsigned long irqflags; + struct ionic_cq *cq; + int rc; + + rc = ionic_destroy_qp_cmd(dev, qp->qpid); + if (rc) + return rc; + + xa_erase_irq(&dev->qp_tbl, qp->qpid); + + kref_put(&qp->qp_kref, ionic_qp_complete); + wait_for_completion(&qp->qp_rel_comp); + + if (qp->ibqp.send_cq) { + cq = to_ionic_vcq_cq(qp->ibqp.send_cq, qp->udma_idx); + spin_lock_irqsave(&cq->lock, irqflags); + ionic_clean_cq(cq, qp->qpid); + list_del(&qp->cq_poll_sq); + list_del(&qp->cq_flush_sq); + spin_unlock_irqrestore(&cq->lock, irqflags); + } + + if (qp->ibqp.recv_cq) { + cq = to_ionic_vcq_cq(qp->ibqp.recv_cq, qp->udma_idx); + spin_lock_irqsave(&cq->lock, irqflags); + ionic_clean_cq(cq, qp->qpid); + list_del(&qp->cq_flush_rq); + spin_unlock_irqrestore(&cq->lock, irqflags); + } + + ionic_qp_rq_destroy(dev, ctx, qp); + ionic_qp_sq_destroy(dev, ctx, qp); + if (qp->has_ah) { + ionic_put_ahid(dev, qp->ahid); + kfree(qp->hdr); + } + ionic_put_qpid(dev, qp->qpid); + + return 0; +} diff --git a/drivers/infiniband/hw/ionic/ionic_datapath.c b/drivers/infiniband/hw/ionic/ionic_datapath.c new file mode 100644 index 000000000000..aa2944887f23 --- /dev/null +++ b/drivers/infiniband/hw/ionic/ionic_datapath.c @@ -0,0 +1,1399 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2025, Advanced Micro Devices, Inc. */ + +#include <linux/module.h> +#include <linux/printk.h> +#include <rdma/ib_addr.h> +#include <rdma/ib_user_verbs.h> + +#include "ionic_fw.h" +#include "ionic_ibdev.h" + +#define IONIC_OP(version, opname) \ + ((version) < 2 ? 
IONIC_V1_OP_##opname : IONIC_V2_OP_##opname) + +static bool ionic_next_cqe(struct ionic_ibdev *dev, struct ionic_cq *cq, + struct ionic_v1_cqe **cqe) +{ + struct ionic_v1_cqe *qcqe = ionic_queue_at_prod(&cq->q); + + if (unlikely(cq->color != ionic_v1_cqe_color(qcqe))) + return false; + + /* Prevent out-of-order reads of the CQE */ + dma_rmb(); + + *cqe = qcqe; + + return true; +} + +static int ionic_flush_recv(struct ionic_qp *qp, struct ib_wc *wc) +{ + struct ionic_rq_meta *meta; + struct ionic_v1_wqe *wqe; + + if (!qp->rq_flush) + return 0; + + if (ionic_queue_empty(&qp->rq)) + return 0; + + wqe = ionic_queue_at_cons(&qp->rq); + + /* wqe_id must be a valid queue index */ + if (unlikely(wqe->base.wqe_id >> qp->rq.depth_log2)) { + ibdev_warn(qp->ibqp.device, + "flush qp %u recv index %llu invalid\n", + qp->qpid, (unsigned long long)wqe->base.wqe_id); + return -EIO; + } + + /* wqe_id must indicate a request that is outstanding */ + meta = &qp->rq_meta[wqe->base.wqe_id]; + if (unlikely(meta->next != IONIC_META_POSTED)) { + ibdev_warn(qp->ibqp.device, + "flush qp %u recv index %llu not posted\n", + qp->qpid, (unsigned long long)wqe->base.wqe_id); + return -EIO; + } + + ionic_queue_consume(&qp->rq); + + memset(wc, 0, sizeof(*wc)); + + wc->status = IB_WC_WR_FLUSH_ERR; + wc->wr_id = meta->wrid; + wc->qp = &qp->ibqp; + + meta->next = qp->rq_meta_head; + qp->rq_meta_head = meta; + + return 1; +} + +static int ionic_flush_recv_many(struct ionic_qp *qp, + struct ib_wc *wc, int nwc) +{ + int rc = 0, npolled = 0; + + while (npolled < nwc) { + rc = ionic_flush_recv(qp, wc + npolled); + if (rc <= 0) + break; + + npolled += rc; + } + + return npolled ?: rc; +} + +static int ionic_flush_send(struct ionic_qp *qp, struct ib_wc *wc) +{ + struct ionic_sq_meta *meta; + + if (!qp->sq_flush) + return 0; + + if (ionic_queue_empty(&qp->sq)) + return 0; + + meta = &qp->sq_meta[qp->sq.cons]; + + ionic_queue_consume(&qp->sq); + + memset(wc, 0, sizeof(*wc)); + + wc->status = IB_WC_WR_FLUSH_ERR; + wc->wr_id = meta->wrid; + wc->qp = &qp->ibqp; + + return 1; +} + +static int ionic_flush_send_many(struct ionic_qp *qp, + struct ib_wc *wc, int nwc) +{ + int rc = 0, npolled = 0; + + while (npolled < nwc) { + rc = ionic_flush_send(qp, wc + npolled); + if (rc <= 0) + break; + + npolled += rc; + } + + return npolled ?: rc; +} + +static int ionic_poll_recv(struct ionic_ibdev *dev, struct ionic_cq *cq, + struct ionic_qp *cqe_qp, struct ionic_v1_cqe *cqe, + struct ib_wc *wc) +{ + struct ionic_qp *qp = NULL; + struct ionic_rq_meta *meta; + u32 src_qpn, st_len; + u16 vlan_tag; + u8 op; + + if (cqe_qp->rq_flush) + return 0; + + qp = cqe_qp; + + st_len = be32_to_cpu(cqe->status_length); + + /* ignore wqe_id in case of flush error */ + if (ionic_v1_cqe_error(cqe) && st_len == IONIC_STS_WQE_FLUSHED_ERR) { + cqe_qp->rq_flush = true; + cq->flush = true; + list_move_tail(&qp->cq_flush_rq, &cq->flush_rq); + + /* posted recvs (if any) flushed by ionic_flush_recv */ + return 0; + } + + /* there had better be something in the recv queue to complete */ + if (ionic_queue_empty(&qp->rq)) { + ibdev_warn(&dev->ibdev, "qp %u is empty\n", qp->qpid); + return -EIO; + } + + /* wqe_id must be a valid queue index */ + if (unlikely(cqe->recv.wqe_id >> qp->rq.depth_log2)) { + ibdev_warn(&dev->ibdev, + "qp %u recv index %llu invalid\n", + qp->qpid, (unsigned long long)cqe->recv.wqe_id); + return -EIO; + } + + /* wqe_id must indicate a request that is outstanding */ + meta = &qp->rq_meta[cqe->recv.wqe_id]; + if (unlikely(meta->next != IONIC_META_POSTED)) { + 
ibdev_warn(&dev->ibdev, + "qp %u recv index %llu not posted\n", + qp->qpid, (unsigned long long)cqe->recv.wqe_id); + return -EIO; + } + + meta->next = qp->rq_meta_head; + qp->rq_meta_head = meta; + + memset(wc, 0, sizeof(*wc)); + + wc->wr_id = meta->wrid; + + wc->qp = &cqe_qp->ibqp; + + if (ionic_v1_cqe_error(cqe)) { + wc->vendor_err = st_len; + wc->status = ionic_to_ib_status(st_len); + + cqe_qp->rq_flush = true; + cq->flush = true; + list_move_tail(&qp->cq_flush_rq, &cq->flush_rq); + + ibdev_warn(&dev->ibdev, + "qp %d recv cqe with error\n", qp->qpid); + print_hex_dump(KERN_WARNING, "cqe ", DUMP_PREFIX_OFFSET, 16, 1, + cqe, BIT(cq->q.stride_log2), true); + goto out; + } + + wc->vendor_err = 0; + wc->status = IB_WC_SUCCESS; + + src_qpn = be32_to_cpu(cqe->recv.src_qpn_op); + op = src_qpn >> IONIC_V1_CQE_RECV_OP_SHIFT; + + src_qpn &= IONIC_V1_CQE_RECV_QPN_MASK; + op &= IONIC_V1_CQE_RECV_OP_MASK; + + wc->opcode = IB_WC_RECV; + switch (op) { + case IONIC_V1_CQE_RECV_OP_RDMA_IMM: + wc->opcode = IB_WC_RECV_RDMA_WITH_IMM; + wc->wc_flags |= IB_WC_WITH_IMM; + wc->ex.imm_data = cqe->recv.imm_data_rkey; /* be32 in wc */ + break; + case IONIC_V1_CQE_RECV_OP_SEND_IMM: + wc->wc_flags |= IB_WC_WITH_IMM; + wc->ex.imm_data = cqe->recv.imm_data_rkey; /* be32 in wc */ + break; + case IONIC_V1_CQE_RECV_OP_SEND_INV: + wc->wc_flags |= IB_WC_WITH_INVALIDATE; + wc->ex.invalidate_rkey = be32_to_cpu(cqe->recv.imm_data_rkey); + break; + } + + wc->byte_len = st_len; + wc->src_qp = src_qpn; + + if (qp->ibqp.qp_type == IB_QPT_UD || + qp->ibqp.qp_type == IB_QPT_GSI) { + wc->wc_flags |= IB_WC_GRH | IB_WC_WITH_SMAC; + ether_addr_copy(wc->smac, cqe->recv.src_mac); + + wc->wc_flags |= IB_WC_WITH_NETWORK_HDR_TYPE; + if (ionic_v1_cqe_recv_is_ipv4(cqe)) + wc->network_hdr_type = RDMA_NETWORK_IPV4; + else + wc->network_hdr_type = RDMA_NETWORK_IPV6; + + if (ionic_v1_cqe_recv_is_vlan(cqe)) + wc->wc_flags |= IB_WC_WITH_VLAN; + + /* vlan_tag in cqe will be valid from dpath even if no vlan */ + vlan_tag = be16_to_cpu(cqe->recv.vlan_tag); + wc->vlan_id = vlan_tag & 0xfff; /* 802.1q VID */ + wc->sl = vlan_tag >> VLAN_PRIO_SHIFT; /* 802.1q PCP */ + } + + wc->pkey_index = 0; + wc->port_num = 1; + +out: + ionic_queue_consume(&qp->rq); + + return 1; +} + +static bool ionic_peek_send(struct ionic_qp *qp) +{ + struct ionic_sq_meta *meta; + + if (qp->sq_flush) + return false; + + /* completed all send queue requests */ + if (ionic_queue_empty(&qp->sq)) + return false; + + meta = &qp->sq_meta[qp->sq.cons]; + + /* waiting for remote completion */ + if (meta->remote && meta->seq == qp->sq_msn_cons) + return false; + + /* waiting for local completion */ + if (!meta->remote && !meta->local_comp) + return false; + + return true; +} + +static int ionic_poll_send(struct ionic_ibdev *dev, struct ionic_cq *cq, + struct ionic_qp *qp, struct ib_wc *wc) +{ + struct ionic_sq_meta *meta; + + if (qp->sq_flush) + return 0; + + do { + /* completed all send queue requests */ + if (ionic_queue_empty(&qp->sq)) + goto out_empty; + + meta = &qp->sq_meta[qp->sq.cons]; + + /* waiting for remote completion */ + if (meta->remote && meta->seq == qp->sq_msn_cons) + goto out_empty; + + /* waiting for local completion */ + if (!meta->remote && !meta->local_comp) + goto out_empty; + + ionic_queue_consume(&qp->sq); + + /* produce wc only if signaled or error status */ + } while (!meta->signal && meta->ibsts == IB_WC_SUCCESS); + + memset(wc, 0, sizeof(*wc)); + + wc->status = meta->ibsts; + wc->wr_id = meta->wrid; + wc->qp = &qp->ibqp; + + if (meta->ibsts == IB_WC_SUCCESS) { 
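+ /* on success, meta->len is the payload length; on error it holds the vendor status, reported in wc->vendor_err below */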
+ wc->byte_len = meta->len; + wc->opcode = meta->ibop; + } else { + wc->vendor_err = meta->len; + + qp->sq_flush = true; + cq->flush = true; + list_move_tail(&qp->cq_flush_sq, &cq->flush_sq); + } + + return 1; + +out_empty: + if (qp->sq_flush_rcvd) { + qp->sq_flush = true; + cq->flush = true; + list_move_tail(&qp->cq_flush_sq, &cq->flush_sq); + } + return 0; +} + +static int ionic_poll_send_many(struct ionic_ibdev *dev, struct ionic_cq *cq, + struct ionic_qp *qp, struct ib_wc *wc, int nwc) +{ + int rc = 0, npolled = 0; + + while (npolled < nwc) { + rc = ionic_poll_send(dev, cq, qp, wc + npolled); + if (rc <= 0) + break; + + npolled += rc; + } + + return npolled ?: rc; +} + +static int ionic_validate_cons(u16 prod, u16 cons, + u16 comp, u16 mask) +{ + if (((prod - cons) & mask) <= ((comp - cons) & mask)) + return -EIO; + + return 0; +} + +static int ionic_comp_msn(struct ionic_qp *qp, struct ionic_v1_cqe *cqe) +{ + struct ionic_sq_meta *meta; + u16 cqe_seq, cqe_idx; + int rc; + + if (qp->sq_flush) + return 0; + + cqe_seq = be32_to_cpu(cqe->send.msg_msn) & qp->sq.mask; + + rc = ionic_validate_cons(qp->sq_msn_prod, + qp->sq_msn_cons, + cqe_seq - 1, + qp->sq.mask); + if (rc) { + ibdev_warn(qp->ibqp.device, + "qp %u bad msn %#x seq %u for prod %u cons %u\n", + qp->qpid, be32_to_cpu(cqe->send.msg_msn), + cqe_seq, qp->sq_msn_prod, qp->sq_msn_cons); + return rc; + } + + qp->sq_msn_cons = cqe_seq; + + if (ionic_v1_cqe_error(cqe)) { + cqe_idx = qp->sq_msn_idx[(cqe_seq - 1) & qp->sq.mask]; + + meta = &qp->sq_meta[cqe_idx]; + meta->len = be32_to_cpu(cqe->status_length); + meta->ibsts = ionic_to_ib_status(meta->len); + + ibdev_warn(qp->ibqp.device, + "qp %d msn cqe with error\n", qp->qpid); + print_hex_dump(KERN_WARNING, "cqe ", DUMP_PREFIX_OFFSET, 16, 1, + cqe, sizeof(*cqe), true); + } + + return 0; +} + +static int ionic_comp_npg(struct ionic_qp *qp, struct ionic_v1_cqe *cqe) +{ + struct ionic_sq_meta *meta; + u16 cqe_idx; + u32 st_len; + + if (qp->sq_flush) + return 0; + + st_len = be32_to_cpu(cqe->status_length); + + if (ionic_v1_cqe_error(cqe) && st_len == IONIC_STS_WQE_FLUSHED_ERR) { + /* + * Flush cqe does not consume a wqe on the device, and maybe + * no such work request is posted. + * + * The driver should begin flushing after the last indicated + * normal or error completion. Here, only set a hint that the + * flush request was indicated. In poll_send, if nothing more + * can be polled normally, then begin flushing. 
+ */ + qp->sq_flush_rcvd = true; + return 0; + } + + cqe_idx = cqe->send.npg_wqe_id & qp->sq.mask; + meta = &qp->sq_meta[cqe_idx]; + meta->local_comp = true; + + if (ionic_v1_cqe_error(cqe)) { + meta->len = st_len; + meta->ibsts = ionic_to_ib_status(st_len); + meta->remote = false; + ibdev_warn(qp->ibqp.device, + "qp %d npg cqe with error\n", qp->qpid); + print_hex_dump(KERN_WARNING, "cqe ", DUMP_PREFIX_OFFSET, 16, 1, + cqe, sizeof(*cqe), true); + } + + return 0; +} + +static void ionic_reserve_sync_cq(struct ionic_ibdev *dev, struct ionic_cq *cq) +{ + if (!ionic_queue_empty(&cq->q)) { + cq->credit += ionic_queue_length(&cq->q); + cq->q.cons = cq->q.prod; + + ionic_dbell_ring(dev->lif_cfg.dbpage, dev->lif_cfg.cq_qtype, + ionic_queue_dbell_val(&cq->q)); + } +} + +static void ionic_reserve_cq(struct ionic_ibdev *dev, struct ionic_cq *cq, + int spend) +{ + cq->credit -= spend; + + if (cq->credit <= 0) + ionic_reserve_sync_cq(dev, cq); +} + +static int ionic_poll_vcq_cq(struct ionic_ibdev *dev, + struct ionic_cq *cq, + int nwc, struct ib_wc *wc) +{ + struct ionic_qp *qp, *qp_next; + struct ionic_v1_cqe *cqe; + int rc = 0, npolled = 0; + unsigned long irqflags; + u32 qtf, qid; + bool peek; + u8 type; + + if (nwc < 1) + return 0; + + spin_lock_irqsave(&cq->lock, irqflags); + + /* poll already indicated work completions for send queue */ + list_for_each_entry_safe(qp, qp_next, &cq->poll_sq, cq_poll_sq) { + if (npolled == nwc) + goto out; + + spin_lock(&qp->sq_lock); + rc = ionic_poll_send_many(dev, cq, qp, wc + npolled, + nwc - npolled); + spin_unlock(&qp->sq_lock); + + if (rc > 0) + npolled += rc; + + if (npolled < nwc) + list_del_init(&qp->cq_poll_sq); + } + + /* poll for more work completions */ + while (likely(ionic_next_cqe(dev, cq, &cqe))) { + if (npolled == nwc) + goto out; + + qtf = ionic_v1_cqe_qtf(cqe); + qid = ionic_v1_cqe_qtf_qid(qtf); + type = ionic_v1_cqe_qtf_type(qtf); + + /* + * Safe to access QP without additional reference here as, + * 1. We hold cq->lock throughout + * 2. ionic_destroy_qp() acquires the same cq->lock before cleanup + * 3. QP is removed from qp_tbl before any cleanup begins + * This ensures no concurrent access between polling and destruction. 
+ */ + qp = xa_load(&dev->qp_tbl, qid); + if (unlikely(!qp)) { + ibdev_dbg(&dev->ibdev, "missing qp for qid %u\n", qid); + goto cq_next; + } + + switch (type) { + case IONIC_V1_CQE_TYPE_RECV: + spin_lock(&qp->rq_lock); + rc = ionic_poll_recv(dev, cq, qp, cqe, wc + npolled); + spin_unlock(&qp->rq_lock); + + if (rc < 0) + goto out; + + npolled += rc; + + break; + + case IONIC_V1_CQE_TYPE_SEND_MSN: + spin_lock(&qp->sq_lock); + rc = ionic_comp_msn(qp, cqe); + if (!rc) { + rc = ionic_poll_send_many(dev, cq, qp, + wc + npolled, + nwc - npolled); + peek = ionic_peek_send(qp); + } + spin_unlock(&qp->sq_lock); + + if (rc < 0) + goto out; + + npolled += rc; + + if (peek) + list_move_tail(&qp->cq_poll_sq, &cq->poll_sq); + break; + + case IONIC_V1_CQE_TYPE_SEND_NPG: + spin_lock(&qp->sq_lock); + rc = ionic_comp_npg(qp, cqe); + if (!rc) { + rc = ionic_poll_send_many(dev, cq, qp, + wc + npolled, + nwc - npolled); + peek = ionic_peek_send(qp); + } + spin_unlock(&qp->sq_lock); + + if (rc < 0) + goto out; + + npolled += rc; + + if (peek) + list_move_tail(&qp->cq_poll_sq, &cq->poll_sq); + break; + + default: + ibdev_warn(&dev->ibdev, + "unexpected cqe type %u\n", type); + rc = -EIO; + goto out; + } + +cq_next: + ionic_queue_produce(&cq->q); + cq->color = ionic_color_wrap(cq->q.prod, cq->color); + } + + /* lastly, flush send and recv queues */ + if (likely(!cq->flush)) + goto out; + + cq->flush = false; + + list_for_each_entry_safe(qp, qp_next, &cq->flush_sq, cq_flush_sq) { + if (npolled == nwc) + goto out; + + spin_lock(&qp->sq_lock); + rc = ionic_flush_send_many(qp, wc + npolled, nwc - npolled); + spin_unlock(&qp->sq_lock); + + if (rc > 0) + npolled += rc; + + if (npolled < nwc) + list_del_init(&qp->cq_flush_sq); + else + cq->flush = true; + } + + list_for_each_entry_safe(qp, qp_next, &cq->flush_rq, cq_flush_rq) { + if (npolled == nwc) + goto out; + + spin_lock(&qp->rq_lock); + rc = ionic_flush_recv_many(qp, wc + npolled, nwc - npolled); + spin_unlock(&qp->rq_lock); + + if (rc > 0) + npolled += rc; + + if (npolled < nwc) + list_del_init(&qp->cq_flush_rq); + else + cq->flush = true; + } + +out: + /* in case credit was depleted (more work posted than cq depth) */ + if (cq->credit <= 0) + ionic_reserve_sync_cq(dev, cq); + + spin_unlock_irqrestore(&cq->lock, irqflags); + + return npolled ?: rc; +} + +int ionic_poll_cq(struct ib_cq *ibcq, int nwc, struct ib_wc *wc) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibcq->device); + struct ionic_vcq *vcq = to_ionic_vcq(ibcq); + int rc_tmp, rc = 0, npolled = 0; + int cq_i, cq_x, cq_ix; + + cq_x = vcq->poll_idx; + vcq->poll_idx ^= dev->lif_cfg.udma_count - 1; + + for (cq_i = 0; npolled < nwc && cq_i < dev->lif_cfg.udma_count; ++cq_i) { + cq_ix = cq_i ^ cq_x; + + if (!(vcq->udma_mask & BIT(cq_ix))) + continue; + + rc_tmp = ionic_poll_vcq_cq(dev, &vcq->cq[cq_ix], + nwc - npolled, + wc + npolled); + + if (rc_tmp >= 0) + npolled += rc_tmp; + else if (!rc) + rc = rc_tmp; + } + + return npolled ?: rc; +} + +static int ionic_req_notify_vcq_cq(struct ionic_ibdev *dev, struct ionic_cq *cq, + enum ib_cq_notify_flags flags) +{ + u64 dbell_val = cq->q.dbell; + + if (flags & IB_CQ_SOLICITED) { + cq->arm_sol_prod = ionic_queue_next(&cq->q, cq->arm_sol_prod); + dbell_val |= cq->arm_sol_prod | IONIC_CQ_RING_SOL; + } else { + cq->arm_any_prod = ionic_queue_next(&cq->q, cq->arm_any_prod); + dbell_val |= cq->arm_any_prod | IONIC_CQ_RING_ARM; + } + + ionic_reserve_sync_cq(dev, cq); + + ionic_dbell_ring(dev->lif_cfg.dbpage, dev->lif_cfg.cq_qtype, dbell_val); + + /* + * 
IB_CQ_REPORT_MISSED_EVENTS: + * + * The queue index in ring zero guarantees no missed events. + * + * Here, we check if the color bit in the next cqe is flipped. If it + * is flipped, then progress can be made by immediately polling the cq. + * Still, the cq will be armed, and an event will be generated. The cq + * may be empty when polled after the event, because the next poll + * after arming the cq can empty it. + */ + return (flags & IB_CQ_REPORT_MISSED_EVENTS) && + cq->color == ionic_v1_cqe_color(ionic_queue_at_prod(&cq->q)); +} + +int ionic_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibcq->device); + struct ionic_vcq *vcq = to_ionic_vcq(ibcq); + int rc = 0, cq_i; + + for (cq_i = 0; cq_i < dev->lif_cfg.udma_count; ++cq_i) { + if (!(vcq->udma_mask & BIT(cq_i))) + continue; + + if (ionic_req_notify_vcq_cq(dev, &vcq->cq[cq_i], flags)) + rc = 1; + } + + return rc; +} + +static s64 ionic_prep_inline(void *data, u32 max_data, + const struct ib_sge *ib_sgl, int num_sge) +{ + static const s64 bit_31 = 1u << 31; + s64 len = 0, sg_len; + int sg_i; + + for (sg_i = 0; sg_i < num_sge; ++sg_i) { + sg_len = ib_sgl[sg_i].length; + + /* sge length zero means 2GB */ + if (unlikely(sg_len == 0)) + sg_len = bit_31; + + /* greater than max inline data is invalid */ + if (unlikely(len + sg_len > max_data)) + return -EINVAL; + + memcpy(data + len, (void *)ib_sgl[sg_i].addr, sg_len); + + len += sg_len; + } + + return len; +} + +static s64 ionic_prep_pld(struct ionic_v1_wqe *wqe, + union ionic_v1_pld *pld, + int spec, u32 max_sge, + const struct ib_sge *ib_sgl, + int num_sge) +{ + static const s64 bit_31 = 1l << 31; + struct ionic_sge *sgl; + __be32 *spec32 = NULL; + __be16 *spec16 = NULL; + s64 len = 0, sg_len; + int sg_i = 0; + + if (unlikely(num_sge < 0 || (u32)num_sge > max_sge)) + return -EINVAL; + + if (spec && num_sge > IONIC_V1_SPEC_FIRST_SGE) { + sg_i = IONIC_V1_SPEC_FIRST_SGE; + + if (num_sge > 8) { + wqe->base.flags |= cpu_to_be16(IONIC_V1_FLAG_SPEC16); + spec16 = pld->spec16; + } else { + wqe->base.flags |= cpu_to_be16(IONIC_V1_FLAG_SPEC32); + spec32 = pld->spec32; + } + } + + sgl = &pld->sgl[sg_i]; + + for (sg_i = 0; sg_i < num_sge; ++sg_i) { + sg_len = ib_sgl[sg_i].length; + + /* sge length zero means 2GB */ + if (unlikely(sg_len == 0)) + sg_len = bit_31; + + /* greater than 2GB data is invalid */ + if (unlikely(len + sg_len > bit_31)) + return -EINVAL; + + sgl[sg_i].va = cpu_to_be64(ib_sgl[sg_i].addr); + sgl[sg_i].len = cpu_to_be32(sg_len); + sgl[sg_i].lkey = cpu_to_be32(ib_sgl[sg_i].lkey); + + if (spec32) { + spec32[sg_i] = sgl[sg_i].len; + } else if (spec16) { + if (unlikely(sg_len > U16_MAX)) + return -EINVAL; + spec16[sg_i] = cpu_to_be16(sg_len); + } + + len += sg_len; + } + + return len; +} + +static void ionic_prep_base(struct ionic_qp *qp, + const struct ib_send_wr *wr, + struct ionic_sq_meta *meta, + struct ionic_v1_wqe *wqe) +{ + meta->wrid = wr->wr_id; + meta->ibsts = IB_WC_SUCCESS; + meta->signal = false; + meta->local_comp = false; + + wqe->base.wqe_id = qp->sq.prod; + + if (wr->send_flags & IB_SEND_FENCE) + wqe->base.flags |= cpu_to_be16(IONIC_V1_FLAG_FENCE); + + if (wr->send_flags & IB_SEND_SOLICITED) + wqe->base.flags |= cpu_to_be16(IONIC_V1_FLAG_SOL); + + if (qp->sig_all || wr->send_flags & IB_SEND_SIGNALED) { + wqe->base.flags |= cpu_to_be16(IONIC_V1_FLAG_SIG); + meta->signal = true; + } + + meta->seq = qp->sq_msn_prod; + meta->remote = + qp->ibqp.qp_type != IB_QPT_UD && + qp->ibqp.qp_type != IB_QPT_GSI && + 
!ionic_ibop_is_local(wr->opcode); + + if (meta->remote) { + qp->sq_msn_idx[meta->seq] = qp->sq.prod; + qp->sq_msn_prod = ionic_queue_next(&qp->sq, qp->sq_msn_prod); + } + + ionic_queue_produce(&qp->sq); +} + +static int ionic_prep_common(struct ionic_qp *qp, + const struct ib_send_wr *wr, + struct ionic_sq_meta *meta, + struct ionic_v1_wqe *wqe) +{ + s64 signed_len; + u32 mval; + + if (wr->send_flags & IB_SEND_INLINE) { + wqe->base.num_sge_key = 0; + wqe->base.flags |= cpu_to_be16(IONIC_V1_FLAG_INL); + mval = ionic_v1_send_wqe_max_data(qp->sq.stride_log2, false); + signed_len = ionic_prep_inline(wqe->common.pld.data, mval, + wr->sg_list, wr->num_sge); + } else { + wqe->base.num_sge_key = wr->num_sge; + mval = ionic_v1_send_wqe_max_sge(qp->sq.stride_log2, + qp->sq_spec, + false); + signed_len = ionic_prep_pld(wqe, &wqe->common.pld, + qp->sq_spec, mval, + wr->sg_list, wr->num_sge); + } + + if (unlikely(signed_len < 0)) + return signed_len; + + meta->len = signed_len; + wqe->common.length = cpu_to_be32(signed_len); + + ionic_prep_base(qp, wr, meta, wqe); + + return 0; +} + +static void ionic_prep_sq_wqe(struct ionic_qp *qp, void *wqe) +{ + memset(wqe, 0, 1u << qp->sq.stride_log2); +} + +static void ionic_prep_rq_wqe(struct ionic_qp *qp, void *wqe) +{ + memset(wqe, 0, 1u << qp->rq.stride_log2); +} + +static int ionic_prep_send(struct ionic_qp *qp, + const struct ib_send_wr *wr) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(qp->ibqp.device); + struct ionic_sq_meta *meta; + struct ionic_v1_wqe *wqe; + + meta = &qp->sq_meta[qp->sq.prod]; + wqe = ionic_queue_at_prod(&qp->sq); + + ionic_prep_sq_wqe(qp, wqe); + + meta->ibop = IB_WC_SEND; + + switch (wr->opcode) { + case IB_WR_SEND: + wqe->base.op = IONIC_OP(dev->lif_cfg.rdma_version, SEND); + break; + case IB_WR_SEND_WITH_IMM: + wqe->base.op = IONIC_OP(dev->lif_cfg.rdma_version, SEND_IMM); + wqe->base.imm_data_key = wr->ex.imm_data; + break; + case IB_WR_SEND_WITH_INV: + wqe->base.op = IONIC_OP(dev->lif_cfg.rdma_version, SEND_INV); + wqe->base.imm_data_key = + cpu_to_be32(wr->ex.invalidate_rkey); + break; + default: + return -EINVAL; + } + + return ionic_prep_common(qp, wr, meta, wqe); +} + +static int ionic_prep_send_ud(struct ionic_qp *qp, + const struct ib_ud_wr *wr) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(qp->ibqp.device); + struct ionic_sq_meta *meta; + struct ionic_v1_wqe *wqe; + struct ionic_ah *ah; + + if (unlikely(!wr->ah)) + return -EINVAL; + + ah = to_ionic_ah(wr->ah); + + meta = &qp->sq_meta[qp->sq.prod]; + wqe = ionic_queue_at_prod(&qp->sq); + + ionic_prep_sq_wqe(qp, wqe); + + wqe->common.send.ah_id = cpu_to_be32(ah->ahid); + wqe->common.send.dest_qpn = cpu_to_be32(wr->remote_qpn); + wqe->common.send.dest_qkey = cpu_to_be32(wr->remote_qkey); + + meta->ibop = IB_WC_SEND; + + switch (wr->wr.opcode) { + case IB_WR_SEND: + wqe->base.op = IONIC_OP(dev->lif_cfg.rdma_version, SEND); + break; + case IB_WR_SEND_WITH_IMM: + wqe->base.op = IONIC_OP(dev->lif_cfg.rdma_version, SEND_IMM); + wqe->base.imm_data_key = wr->wr.ex.imm_data; + break; + default: + return -EINVAL; + } + + return ionic_prep_common(qp, &wr->wr, meta, wqe); +} + +static int ionic_prep_rdma(struct ionic_qp *qp, + const struct ib_rdma_wr *wr) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(qp->ibqp.device); + struct ionic_sq_meta *meta; + struct ionic_v1_wqe *wqe; + + meta = &qp->sq_meta[qp->sq.prod]; + wqe = ionic_queue_at_prod(&qp->sq); + + ionic_prep_sq_wqe(qp, wqe); + + meta->ibop = IB_WC_RDMA_WRITE; + + switch (wr->wr.opcode) { + case IB_WR_RDMA_READ: + if 
(wr->wr.send_flags & (IB_SEND_SOLICITED | IB_SEND_INLINE)) + return -EINVAL; + meta->ibop = IB_WC_RDMA_READ; + wqe->base.op = IONIC_OP(dev->lif_cfg.rdma_version, RDMA_READ); + break; + case IB_WR_RDMA_WRITE: + if (wr->wr.send_flags & IB_SEND_SOLICITED) + return -EINVAL; + wqe->base.op = IONIC_OP(dev->lif_cfg.rdma_version, RDMA_WRITE); + break; + case IB_WR_RDMA_WRITE_WITH_IMM: + wqe->base.op = IONIC_OP(dev->lif_cfg.rdma_version, RDMA_WRITE_IMM); + wqe->base.imm_data_key = wr->wr.ex.imm_data; + break; + default: + return -EINVAL; + } + + wqe->common.rdma.remote_va_high = cpu_to_be32(wr->remote_addr >> 32); + wqe->common.rdma.remote_va_low = cpu_to_be32(wr->remote_addr); + wqe->common.rdma.remote_rkey = cpu_to_be32(wr->rkey); + + return ionic_prep_common(qp, &wr->wr, meta, wqe); +} + +static int ionic_prep_atomic(struct ionic_qp *qp, + const struct ib_atomic_wr *wr) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(qp->ibqp.device); + struct ionic_sq_meta *meta; + struct ionic_v1_wqe *wqe; + + if (wr->wr.num_sge != 1 || wr->wr.sg_list[0].length != 8) + return -EINVAL; + + if (wr->wr.send_flags & (IB_SEND_SOLICITED | IB_SEND_INLINE)) + return -EINVAL; + + meta = &qp->sq_meta[qp->sq.prod]; + wqe = ionic_queue_at_prod(&qp->sq); + + ionic_prep_sq_wqe(qp, wqe); + + meta->ibop = IB_WC_RDMA_WRITE; + + switch (wr->wr.opcode) { + case IB_WR_ATOMIC_CMP_AND_SWP: + meta->ibop = IB_WC_COMP_SWAP; + wqe->base.op = IONIC_OP(dev->lif_cfg.rdma_version, ATOMIC_CS); + wqe->atomic.swap_add_high = cpu_to_be32(wr->swap >> 32); + wqe->atomic.swap_add_low = cpu_to_be32(wr->swap); + wqe->atomic.compare_high = cpu_to_be32(wr->compare_add >> 32); + wqe->atomic.compare_low = cpu_to_be32(wr->compare_add); + break; + case IB_WR_ATOMIC_FETCH_AND_ADD: + meta->ibop = IB_WC_FETCH_ADD; + wqe->base.op = IONIC_OP(dev->lif_cfg.rdma_version, ATOMIC_FA); + wqe->atomic.swap_add_high = cpu_to_be32(wr->compare_add >> 32); + wqe->atomic.swap_add_low = cpu_to_be32(wr->compare_add); + break; + default: + return -EINVAL; + } + + wqe->atomic.remote_va_high = cpu_to_be32(wr->remote_addr >> 32); + wqe->atomic.remote_va_low = cpu_to_be32(wr->remote_addr); + wqe->atomic.remote_rkey = cpu_to_be32(wr->rkey); + + wqe->base.num_sge_key = 1; + wqe->atomic.sge.va = cpu_to_be64(wr->wr.sg_list[0].addr); + wqe->atomic.sge.len = cpu_to_be32(8); + wqe->atomic.sge.lkey = cpu_to_be32(wr->wr.sg_list[0].lkey); + + return ionic_prep_common(qp, &wr->wr, meta, wqe); +} + +static int ionic_prep_inv(struct ionic_qp *qp, + const struct ib_send_wr *wr) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(qp->ibqp.device); + struct ionic_sq_meta *meta; + struct ionic_v1_wqe *wqe; + + if (wr->send_flags & (IB_SEND_SOLICITED | IB_SEND_INLINE)) + return -EINVAL; + + meta = &qp->sq_meta[qp->sq.prod]; + wqe = ionic_queue_at_prod(&qp->sq); + + ionic_prep_sq_wqe(qp, wqe); + + wqe->base.op = IONIC_OP(dev->lif_cfg.rdma_version, LOCAL_INV); + wqe->base.imm_data_key = cpu_to_be32(wr->ex.invalidate_rkey); + + meta->len = 0; + meta->ibop = IB_WC_LOCAL_INV; + + ionic_prep_base(qp, wr, meta, wqe); + + return 0; +} + +static int ionic_prep_reg(struct ionic_qp *qp, + const struct ib_reg_wr *wr) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(qp->ibqp.device); + struct ionic_mr *mr = to_ionic_mr(wr->mr); + struct ionic_sq_meta *meta; + struct ionic_v1_wqe *wqe; + __le64 dma_addr; + int flags; + + if (wr->wr.send_flags & (IB_SEND_SOLICITED | IB_SEND_INLINE)) + return -EINVAL; + + /* must call ib_map_mr_sg before posting reg wr */ + if (!mr->buf.tbl_pages) + return -EINVAL; + + meta = 
&qp->sq_meta[qp->sq.prod]; + wqe = ionic_queue_at_prod(&qp->sq); + + ionic_prep_sq_wqe(qp, wqe); + + flags = to_ionic_mr_flags(wr->access); + + wqe->base.op = IONIC_OP(dev->lif_cfg.rdma_version, REG_MR); + wqe->base.num_sge_key = wr->key; + wqe->base.imm_data_key = cpu_to_be32(mr->ibmr.lkey); + wqe->reg_mr.va = cpu_to_be64(mr->ibmr.iova); + wqe->reg_mr.length = cpu_to_be64(mr->ibmr.length); + wqe->reg_mr.offset = ionic_pgtbl_off(&mr->buf, mr->ibmr.iova); + dma_addr = ionic_pgtbl_dma(&mr->buf, mr->ibmr.iova); + wqe->reg_mr.dma_addr = cpu_to_be64(le64_to_cpu(dma_addr)); + + wqe->reg_mr.map_count = cpu_to_be32(mr->buf.tbl_pages); + wqe->reg_mr.flags = cpu_to_be16(flags); + wqe->reg_mr.dir_size_log2 = 0; + wqe->reg_mr.page_size_log2 = order_base_2(mr->ibmr.page_size); + + meta->len = 0; + meta->ibop = IB_WC_REG_MR; + + ionic_prep_base(qp, &wr->wr, meta, wqe); + + return 0; +} + +static int ionic_prep_one_rc(struct ionic_qp *qp, + const struct ib_send_wr *wr) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(qp->ibqp.device); + int rc = 0; + + switch (wr->opcode) { + case IB_WR_SEND: + case IB_WR_SEND_WITH_IMM: + case IB_WR_SEND_WITH_INV: + rc = ionic_prep_send(qp, wr); + break; + case IB_WR_RDMA_READ: + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + rc = ionic_prep_rdma(qp, rdma_wr(wr)); + break; + case IB_WR_ATOMIC_CMP_AND_SWP: + case IB_WR_ATOMIC_FETCH_AND_ADD: + rc = ionic_prep_atomic(qp, atomic_wr(wr)); + break; + case IB_WR_LOCAL_INV: + rc = ionic_prep_inv(qp, wr); + break; + case IB_WR_REG_MR: + rc = ionic_prep_reg(qp, reg_wr(wr)); + break; + default: + ibdev_dbg(&dev->ibdev, "invalid opcode %d\n", wr->opcode); + rc = -EINVAL; + } + + return rc; +} + +static int ionic_prep_one_ud(struct ionic_qp *qp, + const struct ib_send_wr *wr) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(qp->ibqp.device); + int rc = 0; + + switch (wr->opcode) { + case IB_WR_SEND: + case IB_WR_SEND_WITH_IMM: + rc = ionic_prep_send_ud(qp, ud_wr(wr)); + break; + default: + ibdev_dbg(&dev->ibdev, "invalid opcode %d\n", wr->opcode); + rc = -EINVAL; + } + + return rc; +} + +static int ionic_prep_recv(struct ionic_qp *qp, + const struct ib_recv_wr *wr) +{ + struct ionic_rq_meta *meta; + struct ionic_v1_wqe *wqe; + s64 signed_len; + u32 mval; + + wqe = ionic_queue_at_prod(&qp->rq); + + /* if wqe is owned by device, caller can try posting again soon */ + if (wqe->base.flags & cpu_to_be16(IONIC_V1_FLAG_FENCE)) + return -EAGAIN; + + meta = qp->rq_meta_head; + if (unlikely(meta == IONIC_META_LAST) || + unlikely(meta == IONIC_META_POSTED)) + return -EIO; + + ionic_prep_rq_wqe(qp, wqe); + + mval = ionic_v1_recv_wqe_max_sge(qp->rq.stride_log2, qp->rq_spec, + false); + signed_len = ionic_prep_pld(wqe, &wqe->recv.pld, + qp->rq_spec, mval, + wr->sg_list, wr->num_sge); + if (signed_len < 0) + return signed_len; + + meta->wrid = wr->wr_id; + + wqe->base.wqe_id = meta - qp->rq_meta; + wqe->base.num_sge_key = wr->num_sge; + + /* total length for recv goes in base imm_data_key */ + wqe->base.imm_data_key = cpu_to_be32(signed_len); + + ionic_queue_produce(&qp->rq); + + qp->rq_meta_head = meta->next; + meta->next = IONIC_META_POSTED; + + return 0; +} + +static int ionic_post_send_common(struct ionic_ibdev *dev, + struct ionic_vcq *vcq, + struct ionic_cq *cq, + struct ionic_qp *qp, + const struct ib_send_wr *wr, + const struct ib_send_wr **bad) +{ + unsigned long irqflags; + bool notify = false; + int spend, rc = 0; + + if (!bad) + return -EINVAL; + + if (!qp->has_sq) { + *bad = wr; + return -EINVAL; + } + + if (qp->state < 
IB_QPS_RTS) { + *bad = wr; + return -EINVAL; + } + + spin_lock_irqsave(&qp->sq_lock, irqflags); + + while (wr) { + if (ionic_queue_full(&qp->sq)) { + ibdev_dbg(&dev->ibdev, "queue full"); + rc = -ENOMEM; + goto out; + } + + if (qp->ibqp.qp_type == IB_QPT_UD || + qp->ibqp.qp_type == IB_QPT_GSI) + rc = ionic_prep_one_ud(qp, wr); + else + rc = ionic_prep_one_rc(qp, wr); + if (rc) + goto out; + + wr = wr->next; + } + +out: + spin_unlock_irqrestore(&qp->sq_lock, irqflags); + + spin_lock_irqsave(&cq->lock, irqflags); + spin_lock(&qp->sq_lock); + + if (likely(qp->sq.prod != qp->sq_old_prod)) { + /* ring cq doorbell just in time */ + spend = (qp->sq.prod - qp->sq_old_prod) & qp->sq.mask; + ionic_reserve_cq(dev, cq, spend); + + qp->sq_old_prod = qp->sq.prod; + + ionic_dbell_ring(dev->lif_cfg.dbpage, dev->lif_cfg.sq_qtype, + ionic_queue_dbell_val(&qp->sq)); + } + + if (qp->sq_flush) { + notify = true; + cq->flush = true; + list_move_tail(&qp->cq_flush_sq, &cq->flush_sq); + } + + spin_unlock(&qp->sq_lock); + spin_unlock_irqrestore(&cq->lock, irqflags); + + if (notify && vcq->ibcq.comp_handler) + vcq->ibcq.comp_handler(&vcq->ibcq, vcq->ibcq.cq_context); + + *bad = wr; + return rc; +} + +static int ionic_post_recv_common(struct ionic_ibdev *dev, + struct ionic_vcq *vcq, + struct ionic_cq *cq, + struct ionic_qp *qp, + const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad) +{ + unsigned long irqflags; + bool notify = false; + int spend, rc = 0; + + if (!bad) + return -EINVAL; + + if (!qp->has_rq) { + *bad = wr; + return -EINVAL; + } + + if (qp->state < IB_QPS_INIT) { + *bad = wr; + return -EINVAL; + } + + spin_lock_irqsave(&qp->rq_lock, irqflags); + + while (wr) { + if (ionic_queue_full(&qp->rq)) { + ibdev_dbg(&dev->ibdev, "queue full"); + rc = -ENOMEM; + goto out; + } + + rc = ionic_prep_recv(qp, wr); + if (rc) + goto out; + + wr = wr->next; + } + +out: + if (!cq) { + spin_unlock_irqrestore(&qp->rq_lock, irqflags); + goto out_unlocked; + } + spin_unlock_irqrestore(&qp->rq_lock, irqflags); + + spin_lock_irqsave(&cq->lock, irqflags); + spin_lock(&qp->rq_lock); + + if (likely(qp->rq.prod != qp->rq_old_prod)) { + /* ring cq doorbell just in time */ + spend = (qp->rq.prod - qp->rq_old_prod) & qp->rq.mask; + ionic_reserve_cq(dev, cq, spend); + + qp->rq_old_prod = qp->rq.prod; + + ionic_dbell_ring(dev->lif_cfg.dbpage, dev->lif_cfg.rq_qtype, + ionic_queue_dbell_val(&qp->rq)); + } + + if (qp->rq_flush) { + notify = true; + cq->flush = true; + list_move_tail(&qp->cq_flush_rq, &cq->flush_rq); + } + + spin_unlock(&qp->rq_lock); + spin_unlock_irqrestore(&cq->lock, irqflags); + + if (notify && vcq->ibcq.comp_handler) + vcq->ibcq.comp_handler(&vcq->ibcq, vcq->ibcq.cq_context); + +out_unlocked: + *bad = wr; + return rc; +} + +int ionic_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibqp->device); + struct ionic_vcq *vcq = to_ionic_vcq(ibqp->send_cq); + struct ionic_qp *qp = to_ionic_qp(ibqp); + struct ionic_cq *cq = + to_ionic_vcq_cq(ibqp->send_cq, qp->udma_idx); + + return ionic_post_send_common(dev, vcq, cq, qp, wr, bad); +} + +int ionic_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibqp->device); + struct ionic_vcq *vcq = to_ionic_vcq(ibqp->recv_cq); + struct ionic_qp *qp = to_ionic_qp(ibqp); + struct ionic_cq *cq = + to_ionic_vcq_cq(ibqp->recv_cq, qp->udma_idx); + + return ionic_post_recv_common(dev, vcq, cq, qp, wr, bad); +} 
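For context on the IB_CQ_REPORT_MISSED_EVENTS behavior implemented by ionic_req_notify_cq() above, here is a minimal sketch of the standard poll/re-arm loop a kernel ULP would run against this driver (illustrative only, not part of this patch; example_drain_cq and its parameters are hypothetical):

	static void example_drain_cq(struct ib_cq *cq, struct ib_wc *wc, int nwc)
	{
		int n;

	again:
		while ((n = ib_poll_cq(cq, nwc, wc)) > 0)
			; /* process the n completions in wc[] here */

		/*
		 * Re-arm the cq.  A positive return with
		 * IB_CQ_REPORT_MISSED_EVENTS means a completion may have
		 * arrived between the final poll and the arm, so poll again
		 * instead of waiting for the completion event.
		 */
		if (ib_req_notify_cq(cq, IB_CQ_NEXT_COMP |
				     IB_CQ_REPORT_MISSED_EVENTS) > 0)
			goto again;
	}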
diff --git a/drivers/infiniband/hw/ionic/ionic_fw.h b/drivers/infiniband/hw/ionic/ionic_fw.h new file mode 100644 index 000000000000..adfbb89d856c --- /dev/null +++ b/drivers/infiniband/hw/ionic/ionic_fw.h @@ -0,0 +1,1029 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2025, Advanced Micro Devices, Inc. */ + +#ifndef _IONIC_FW_H_ +#define _IONIC_FW_H_ + +#include <linux/kernel.h> +#include <rdma/ib_verbs.h> + +/* common for ib spec */ + +#define IONIC_EXP_DBELL_SZ 8 + +enum ionic_mrid_bits { + IONIC_MRID_INDEX_SHIFT = 8, +}; + +static inline u32 ionic_mrid(u32 index, u8 key) +{ + return (index << IONIC_MRID_INDEX_SHIFT) | key; +} + +static inline u32 ionic_mrid_index(u32 lrkey) +{ + return lrkey >> IONIC_MRID_INDEX_SHIFT; +} + +/* common to all versions */ + +/* wqe scatter gather element */ +struct ionic_sge { + __be64 va; + __be32 len; + __be32 lkey; +}; + +/* admin queue mr type */ +enum ionic_mr_flags { + /* bits that determine mr access */ + IONIC_MRF_LOCAL_WRITE = BIT(0), + IONIC_MRF_REMOTE_WRITE = BIT(1), + IONIC_MRF_REMOTE_READ = BIT(2), + IONIC_MRF_REMOTE_ATOMIC = BIT(3), + IONIC_MRF_MW_BIND = BIT(4), + IONIC_MRF_ZERO_BASED = BIT(5), + IONIC_MRF_ON_DEMAND = BIT(6), + IONIC_MRF_PB = BIT(7), + IONIC_MRF_ACCESS_MASK = BIT(12) - 1, + + /* bits that determine mr type */ + IONIC_MRF_UKEY_EN = BIT(13), + IONIC_MRF_IS_MW = BIT(14), + IONIC_MRF_INV_EN = BIT(15), + + /* base flags combinations for mr types */ + IONIC_MRF_USER_MR = 0, + IONIC_MRF_PHYS_MR = (IONIC_MRF_UKEY_EN | + IONIC_MRF_INV_EN), + IONIC_MRF_MW_1 = (IONIC_MRF_UKEY_EN | + IONIC_MRF_IS_MW), + IONIC_MRF_MW_2 = (IONIC_MRF_UKEY_EN | + IONIC_MRF_IS_MW | + IONIC_MRF_INV_EN), +}; + +static inline int to_ionic_mr_flags(int access) +{ + int flags = 0; + + if (access & IB_ACCESS_LOCAL_WRITE) + flags |= IONIC_MRF_LOCAL_WRITE; + + if (access & IB_ACCESS_REMOTE_READ) + flags |= IONIC_MRF_REMOTE_READ; + + if (access & IB_ACCESS_REMOTE_WRITE) + flags |= IONIC_MRF_REMOTE_WRITE; + + if (access & IB_ACCESS_REMOTE_ATOMIC) + flags |= IONIC_MRF_REMOTE_ATOMIC; + + if (access & IB_ACCESS_MW_BIND) + flags |= IONIC_MRF_MW_BIND; + + if (access & IB_ZERO_BASED) + flags |= IONIC_MRF_ZERO_BASED; + + return flags; +} + +enum ionic_qp_flags { + /* bits that determine qp access */ + IONIC_QPF_REMOTE_WRITE = BIT(0), + IONIC_QPF_REMOTE_READ = BIT(1), + IONIC_QPF_REMOTE_ATOMIC = BIT(2), + + /* bits that determine other qp behavior */ + IONIC_QPF_SQ_PB = BIT(6), + IONIC_QPF_RQ_PB = BIT(7), + IONIC_QPF_SQ_SPEC = BIT(8), + IONIC_QPF_RQ_SPEC = BIT(9), + IONIC_QPF_REMOTE_PRIVILEGED = BIT(10), + IONIC_QPF_SQ_DRAINING = BIT(11), + IONIC_QPF_SQD_NOTIFY = BIT(12), + IONIC_QPF_SQ_CMB = BIT(13), + IONIC_QPF_RQ_CMB = BIT(14), + IONIC_QPF_PRIVILEGED = BIT(15), +}; + +static inline int from_ionic_qp_flags(int flags) +{ + int access_flags = 0; + + if (flags & IONIC_QPF_REMOTE_WRITE) + access_flags |= IB_ACCESS_REMOTE_WRITE; + + if (flags & IONIC_QPF_REMOTE_READ) + access_flags |= IB_ACCESS_REMOTE_READ; + + if (flags & IONIC_QPF_REMOTE_ATOMIC) + access_flags |= IB_ACCESS_REMOTE_ATOMIC; + + return access_flags; +} + +static inline int to_ionic_qp_flags(int access, bool sqd_notify, + bool sq_is_cmb, bool rq_is_cmb, + bool sq_spec, bool rq_spec, + bool privileged, bool remote_privileged) +{ + int flags = 0; + + if (access & IB_ACCESS_REMOTE_WRITE) + flags |= IONIC_QPF_REMOTE_WRITE; + + if (access & IB_ACCESS_REMOTE_READ) + flags |= IONIC_QPF_REMOTE_READ; + + if (access & IB_ACCESS_REMOTE_ATOMIC) + flags |= IONIC_QPF_REMOTE_ATOMIC; + + if (sqd_notify) + flags 
|= IONIC_QPF_SQD_NOTIFY; + + if (sq_is_cmb) + flags |= IONIC_QPF_SQ_CMB; + + if (rq_is_cmb) + flags |= IONIC_QPF_RQ_CMB; + + if (sq_spec) + flags |= IONIC_QPF_SQ_SPEC; + + if (rq_spec) + flags |= IONIC_QPF_RQ_SPEC; + + if (privileged) + flags |= IONIC_QPF_PRIVILEGED; + + if (remote_privileged) + flags |= IONIC_QPF_REMOTE_PRIVILEGED; + + return flags; +} + +/* cqe non-admin status indicated in status_length field when err bit is set */ +enum ionic_status { + IONIC_STS_OK, + IONIC_STS_LOCAL_LEN_ERR, + IONIC_STS_LOCAL_QP_OPER_ERR, + IONIC_STS_LOCAL_PROT_ERR, + IONIC_STS_WQE_FLUSHED_ERR, + IONIC_STS_MEM_MGMT_OPER_ERR, + IONIC_STS_BAD_RESP_ERR, + IONIC_STS_LOCAL_ACC_ERR, + IONIC_STS_REMOTE_INV_REQ_ERR, + IONIC_STS_REMOTE_ACC_ERR, + IONIC_STS_REMOTE_OPER_ERR, + IONIC_STS_RETRY_EXCEEDED, + IONIC_STS_RNR_RETRY_EXCEEDED, + IONIC_STS_XRC_VIO_ERR, + IONIC_STS_LOCAL_SGL_INV_ERR, +}; + +static inline int ionic_to_ib_status(int sts) +{ + switch (sts) { + case IONIC_STS_OK: + return IB_WC_SUCCESS; + case IONIC_STS_LOCAL_LEN_ERR: + return IB_WC_LOC_LEN_ERR; + case IONIC_STS_LOCAL_QP_OPER_ERR: + case IONIC_STS_LOCAL_SGL_INV_ERR: + return IB_WC_LOC_QP_OP_ERR; + case IONIC_STS_LOCAL_PROT_ERR: + return IB_WC_LOC_PROT_ERR; + case IONIC_STS_WQE_FLUSHED_ERR: + return IB_WC_WR_FLUSH_ERR; + case IONIC_STS_MEM_MGMT_OPER_ERR: + return IB_WC_MW_BIND_ERR; + case IONIC_STS_BAD_RESP_ERR: + return IB_WC_BAD_RESP_ERR; + case IONIC_STS_LOCAL_ACC_ERR: + return IB_WC_LOC_ACCESS_ERR; + case IONIC_STS_REMOTE_INV_REQ_ERR: + return IB_WC_REM_INV_REQ_ERR; + case IONIC_STS_REMOTE_ACC_ERR: + return IB_WC_REM_ACCESS_ERR; + case IONIC_STS_REMOTE_OPER_ERR: + return IB_WC_REM_OP_ERR; + case IONIC_STS_RETRY_EXCEEDED: + return IB_WC_RETRY_EXC_ERR; + case IONIC_STS_RNR_RETRY_EXCEEDED: + return IB_WC_RNR_RETRY_EXC_ERR; + case IONIC_STS_XRC_VIO_ERR: + default: + return IB_WC_GENERAL_ERR; + } +} + +/* admin queue qp type */ +enum ionic_qp_type { + IONIC_QPT_RC, + IONIC_QPT_UC, + IONIC_QPT_RD, + IONIC_QPT_UD, + IONIC_QPT_SRQ, + IONIC_QPT_XRC_INI, + IONIC_QPT_XRC_TGT, + IONIC_QPT_XRC_SRQ, +}; + +static inline int to_ionic_qp_type(enum ib_qp_type type) +{ + switch (type) { + case IB_QPT_GSI: + case IB_QPT_UD: + return IONIC_QPT_UD; + case IB_QPT_RC: + return IONIC_QPT_RC; + case IB_QPT_UC: + return IONIC_QPT_UC; + case IB_QPT_XRC_INI: + return IONIC_QPT_XRC_INI; + case IB_QPT_XRC_TGT: + return IONIC_QPT_XRC_TGT; + default: + return -EINVAL; + } +} + +/* admin queue qp state */ +enum ionic_qp_state { + IONIC_QPS_RESET, + IONIC_QPS_INIT, + IONIC_QPS_RTR, + IONIC_QPS_RTS, + IONIC_QPS_SQD, + IONIC_QPS_SQE, + IONIC_QPS_ERR, +}; + +static inline int from_ionic_qp_state(enum ionic_qp_state state) +{ + switch (state) { + case IONIC_QPS_RESET: + return IB_QPS_RESET; + case IONIC_QPS_INIT: + return IB_QPS_INIT; + case IONIC_QPS_RTR: + return IB_QPS_RTR; + case IONIC_QPS_RTS: + return IB_QPS_RTS; + case IONIC_QPS_SQD: + return IB_QPS_SQD; + case IONIC_QPS_SQE: + return IB_QPS_SQE; + case IONIC_QPS_ERR: + return IB_QPS_ERR; + default: + return -EINVAL; + } +} + +static inline int to_ionic_qp_state(enum ib_qp_state state) +{ + switch (state) { + case IB_QPS_RESET: + return IONIC_QPS_RESET; + case IB_QPS_INIT: + return IONIC_QPS_INIT; + case IB_QPS_RTR: + return IONIC_QPS_RTR; + case IB_QPS_RTS: + return IONIC_QPS_RTS; + case IB_QPS_SQD: + return IONIC_QPS_SQD; + case IB_QPS_SQE: + return IONIC_QPS_SQE; + case IB_QPS_ERR: + return IONIC_QPS_ERR; + default: + return 0; + } +} + +static inline int to_ionic_qp_modify_state(enum ib_qp_state to_state, + enum 
ib_qp_state from_state) +{ + return to_ionic_qp_state(to_state) | + (to_ionic_qp_state(from_state) << 4); +} + +/* fw abi v1 */ + +/* data payload part of v1 wqe */ +union ionic_v1_pld { + struct ionic_sge sgl[2]; + __be32 spec32[8]; + __be16 spec16[16]; + __u8 data[32]; +}; + +/* completion queue v1 cqe */ +struct ionic_v1_cqe { + union { + struct { + __be16 cmd_idx; + __u8 cmd_op; + __u8 rsvd[17]; + __le16 old_sq_cindex; + __le16 old_rq_cq_cindex; + } admin; + struct { + __u64 wqe_id; + __be32 src_qpn_op; + __u8 src_mac[6]; + __be16 vlan_tag; + __be32 imm_data_rkey; + } recv; + struct { + __u8 rsvd[4]; + __be32 msg_msn; + __u8 rsvd2[8]; + __u64 npg_wqe_id; + } send; + }; + __be32 status_length; + __be32 qid_type_flags; +}; + +/* bits for cqe recv */ +enum ionic_v1_cqe_src_qpn_bits { + IONIC_V1_CQE_RECV_QPN_MASK = 0xffffff, + IONIC_V1_CQE_RECV_OP_SHIFT = 24, + + /* MASK could be 0x3, but need 0x1f for makeshift values: + * OP_TYPE_RDMA_OPER_WITH_IMM, OP_TYPE_SEND_RCVD + */ + IONIC_V1_CQE_RECV_OP_MASK = 0x1f, + IONIC_V1_CQE_RECV_OP_SEND = 0, + IONIC_V1_CQE_RECV_OP_SEND_INV = 1, + IONIC_V1_CQE_RECV_OP_SEND_IMM = 2, + IONIC_V1_CQE_RECV_OP_RDMA_IMM = 3, + + IONIC_V1_CQE_RECV_IS_IPV4 = BIT(7 + IONIC_V1_CQE_RECV_OP_SHIFT), + IONIC_V1_CQE_RECV_IS_VLAN = BIT(6 + IONIC_V1_CQE_RECV_OP_SHIFT), +}; + +/* bits for cqe qid_type_flags */ +enum ionic_v1_cqe_qtf_bits { + IONIC_V1_CQE_COLOR = BIT(0), + IONIC_V1_CQE_ERROR = BIT(1), + IONIC_V1_CQE_TYPE_SHIFT = 5, + IONIC_V1_CQE_TYPE_MASK = 0x7, + IONIC_V1_CQE_QID_SHIFT = 8, + + IONIC_V1_CQE_TYPE_ADMIN = 0, + IONIC_V1_CQE_TYPE_RECV = 1, + IONIC_V1_CQE_TYPE_SEND_MSN = 2, + IONIC_V1_CQE_TYPE_SEND_NPG = 3, +}; + +static inline bool ionic_v1_cqe_color(struct ionic_v1_cqe *cqe) +{ + return cqe->qid_type_flags & cpu_to_be32(IONIC_V1_CQE_COLOR); +} + +static inline bool ionic_v1_cqe_error(struct ionic_v1_cqe *cqe) +{ + return cqe->qid_type_flags & cpu_to_be32(IONIC_V1_CQE_ERROR); +} + +static inline bool ionic_v1_cqe_recv_is_ipv4(struct ionic_v1_cqe *cqe) +{ + return cqe->recv.src_qpn_op & cpu_to_be32(IONIC_V1_CQE_RECV_IS_IPV4); +} + +static inline bool ionic_v1_cqe_recv_is_vlan(struct ionic_v1_cqe *cqe) +{ + return cqe->recv.src_qpn_op & cpu_to_be32(IONIC_V1_CQE_RECV_IS_VLAN); +} + +static inline void ionic_v1_cqe_clean(struct ionic_v1_cqe *cqe) +{ + cqe->qid_type_flags |= cpu_to_be32(~0u << IONIC_V1_CQE_QID_SHIFT); +} + +static inline u32 ionic_v1_cqe_qtf(struct ionic_v1_cqe *cqe) +{ + return be32_to_cpu(cqe->qid_type_flags); +} + +static inline u8 ionic_v1_cqe_qtf_type(u32 qtf) +{ + return (qtf >> IONIC_V1_CQE_TYPE_SHIFT) & IONIC_V1_CQE_TYPE_MASK; +} + +static inline u32 ionic_v1_cqe_qtf_qid(u32 qtf) +{ + return qtf >> IONIC_V1_CQE_QID_SHIFT; +} + +/* v1 base wqe header */ +struct ionic_v1_base_hdr { + __u64 wqe_id; + __u8 op; + __u8 num_sge_key; + __be16 flags; + __be32 imm_data_key; +}; + +/* v1 receive wqe body */ +struct ionic_v1_recv_bdy { + __u8 rsvd[16]; + union ionic_v1_pld pld; +}; + +/* v1 send/rdma wqe body (common, has sgl) */ +struct ionic_v1_common_bdy { + union { + struct { + __be32 ah_id; + __be32 dest_qpn; + __be32 dest_qkey; + } send; + struct { + __be32 remote_va_high; + __be32 remote_va_low; + __be32 remote_rkey; + } rdma; + }; + __be32 length; + union ionic_v1_pld pld; +}; + +/* v1 atomic wqe body */ +struct ionic_v1_atomic_bdy { + __be32 remote_va_high; + __be32 remote_va_low; + __be32 remote_rkey; + __be32 swap_add_high; + __be32 swap_add_low; + __be32 compare_high; + __be32 compare_low; + __u8 rsvd[4]; + struct ionic_sge sge; +}; + +/* v1 
reg mr wqe body */ +struct ionic_v1_reg_mr_bdy { + __be64 va; + __be64 length; + __be64 offset; + __be64 dma_addr; + __be32 map_count; + __be16 flags; + __u8 dir_size_log2; + __u8 page_size_log2; + __u8 rsvd[8]; +}; + +/* v1 bind mw wqe body */ +struct ionic_v1_bind_mw_bdy { + __be64 va; + __be64 length; + __be32 lkey; + __be16 flags; + __u8 rsvd[26]; +}; + +/* v1 send/recv wqe */ +struct ionic_v1_wqe { + struct ionic_v1_base_hdr base; + union { + struct ionic_v1_recv_bdy recv; + struct ionic_v1_common_bdy common; + struct ionic_v1_atomic_bdy atomic; + struct ionic_v1_reg_mr_bdy reg_mr; + struct ionic_v1_bind_mw_bdy bind_mw; + }; +}; + +/* queue pair v1 send opcodes */ +enum ionic_v1_op { + IONIC_V1_OP_SEND, + IONIC_V1_OP_SEND_INV, + IONIC_V1_OP_SEND_IMM, + IONIC_V1_OP_RDMA_READ, + IONIC_V1_OP_RDMA_WRITE, + IONIC_V1_OP_RDMA_WRITE_IMM, + IONIC_V1_OP_ATOMIC_CS, + IONIC_V1_OP_ATOMIC_FA, + IONIC_V1_OP_REG_MR, + IONIC_V1_OP_LOCAL_INV, + IONIC_V1_OP_BIND_MW, + + /* flags */ + IONIC_V1_FLAG_FENCE = BIT(0), + IONIC_V1_FLAG_SOL = BIT(1), + IONIC_V1_FLAG_INL = BIT(2), + IONIC_V1_FLAG_SIG = BIT(3), + + /* flags last four bits for sgl spec format */ + IONIC_V1_FLAG_SPEC32 = (1u << 12), + IONIC_V1_FLAG_SPEC16 = (2u << 12), + IONIC_V1_SPEC_FIRST_SGE = 2, +}; + +/* queue pair v2 send opcodes */ +enum ionic_v2_op { + IONIC_V2_OPSL_OUT = 0x20, + IONIC_V2_OPSL_IMM = 0x40, + IONIC_V2_OPSL_INV = 0x80, + + IONIC_V2_OP_SEND = 0x0 | IONIC_V2_OPSL_OUT, + IONIC_V2_OP_SEND_IMM = IONIC_V2_OP_SEND | IONIC_V2_OPSL_IMM, + IONIC_V2_OP_SEND_INV = IONIC_V2_OP_SEND | IONIC_V2_OPSL_INV, + + IONIC_V2_OP_RDMA_WRITE = 0x1 | IONIC_V2_OPSL_OUT, + IONIC_V2_OP_RDMA_WRITE_IMM = IONIC_V2_OP_RDMA_WRITE | IONIC_V2_OPSL_IMM, + + IONIC_V2_OP_RDMA_READ = 0x2, + + IONIC_V2_OP_ATOMIC_CS = 0x4, + IONIC_V2_OP_ATOMIC_FA = 0x5, + IONIC_V2_OP_REG_MR = 0x6, + IONIC_V2_OP_LOCAL_INV = 0x7, + IONIC_V2_OP_BIND_MW = 0x8, +}; + +static inline size_t ionic_v1_send_wqe_min_size(int min_sge, int min_data, + int spec, bool expdb) +{ + size_t sz_wqe, sz_sgl, sz_data; + + if (spec > IONIC_V1_SPEC_FIRST_SGE) + min_sge += IONIC_V1_SPEC_FIRST_SGE; + + if (expdb) { + min_sge += 1; + min_data += IONIC_EXP_DBELL_SZ; + } + + sz_wqe = sizeof(struct ionic_v1_wqe); + sz_sgl = offsetof(struct ionic_v1_wqe, common.pld.sgl[min_sge]); + sz_data = offsetof(struct ionic_v1_wqe, common.pld.data[min_data]); + + if (sz_sgl > sz_wqe) + sz_wqe = sz_sgl; + + if (sz_data > sz_wqe) + sz_wqe = sz_data; + + return sz_wqe; +} + +static inline int ionic_v1_send_wqe_max_sge(u8 stride_log2, int spec, + bool expdb) +{ + struct ionic_sge *sge = (void *)(1ull << stride_log2); + struct ionic_v1_wqe *wqe = (void *)0; + int num_sge = 0; + + if (expdb) + sge -= 1; + + if (spec > IONIC_V1_SPEC_FIRST_SGE) + num_sge = IONIC_V1_SPEC_FIRST_SGE; + + num_sge = sge - &wqe->common.pld.sgl[num_sge]; + + if (spec && num_sge > spec) + num_sge = spec; + + return num_sge; +} + +static inline int ionic_v1_send_wqe_max_data(u8 stride_log2, bool expdb) +{ + struct ionic_v1_wqe *wqe = (void *)0; + __u8 *data = (void *)(1ull << stride_log2); + + if (expdb) + data -= IONIC_EXP_DBELL_SZ; + + return data - wqe->common.pld.data; +} + +static inline size_t ionic_v1_recv_wqe_min_size(int min_sge, int spec, + bool expdb) +{ + size_t sz_wqe, sz_sgl; + + if (spec > IONIC_V1_SPEC_FIRST_SGE) + min_sge += IONIC_V1_SPEC_FIRST_SGE; + + if (expdb) + min_sge += 1; + + sz_wqe = sizeof(struct ionic_v1_wqe); + sz_sgl = offsetof(struct ionic_v1_wqe, recv.pld.sgl[min_sge]); + + if (sz_sgl > sz_wqe) + sz_wqe = sz_sgl; + + return 
sz_wqe; +} + +static inline int ionic_v1_recv_wqe_max_sge(u8 stride_log2, int spec, + bool expdb) +{ + struct ionic_sge *sge = (void *)(1ull << stride_log2); + struct ionic_v1_wqe *wqe = (void *)0; + int num_sge = 0; + + if (expdb) + sge -= 1; + + if (spec > IONIC_V1_SPEC_FIRST_SGE) + num_sge = IONIC_V1_SPEC_FIRST_SGE; + + num_sge = sge - &wqe->recv.pld.sgl[num_sge]; + + if (spec && num_sge > spec) + num_sge = spec; + + return num_sge; +} + +static inline int ionic_v1_use_spec_sge(int min_sge, int spec) +{ + if (!spec || min_sge > spec) + return 0; + + if (min_sge <= IONIC_V1_SPEC_FIRST_SGE) + return IONIC_V1_SPEC_FIRST_SGE; + + return spec; +} + +struct ionic_admin_stats_hdr { + __le64 dma_addr; + __le32 length; + __le32 id_ver; + __u8 type_state; +} __packed; + +#define IONIC_ADMIN_STATS_HDRS_IN_V1_LEN 17 +static_assert(sizeof(struct ionic_admin_stats_hdr) == + IONIC_ADMIN_STATS_HDRS_IN_V1_LEN); + +struct ionic_admin_create_ah { + __le64 dma_addr; + __le32 length; + __le32 pd_id; + __le32 id_ver; + __le16 dbid_flags; + __u8 csum_profile; + __u8 crypto; +} __packed; + +#define IONIC_ADMIN_CREATE_AH_IN_V1_LEN 24 +static_assert(sizeof(struct ionic_admin_create_ah) == + IONIC_ADMIN_CREATE_AH_IN_V1_LEN); + +struct ionic_admin_destroy_ah { + __le32 ah_id; +} __packed; + +#define IONIC_ADMIN_DESTROY_AH_IN_V1_LEN 4 +static_assert(sizeof(struct ionic_admin_destroy_ah) == + IONIC_ADMIN_DESTROY_AH_IN_V1_LEN); + +struct ionic_admin_query_ah { + __le64 dma_addr; +} __packed; + +#define IONIC_ADMIN_QUERY_AH_IN_V1_LEN 8 +static_assert(sizeof(struct ionic_admin_query_ah) == + IONIC_ADMIN_QUERY_AH_IN_V1_LEN); + +struct ionic_admin_create_mr { + __le64 va; + __le64 length; + __le32 pd_id; + __le32 id_ver; + __le32 tbl_index; + __le32 map_count; + __le64 dma_addr; + __le16 dbid_flags; + __u8 pt_type; + __u8 dir_size_log2; + __u8 page_size_log2; +} __packed; + +#define IONIC_ADMIN_CREATE_MR_IN_V1_LEN 45 +static_assert(sizeof(struct ionic_admin_create_mr) == + IONIC_ADMIN_CREATE_MR_IN_V1_LEN); + +struct ionic_admin_destroy_mr { + __le32 mr_id; +} __packed; + +#define IONIC_ADMIN_DESTROY_MR_IN_V1_LEN 4 +static_assert(sizeof(struct ionic_admin_destroy_mr) == + IONIC_ADMIN_DESTROY_MR_IN_V1_LEN); + +struct ionic_admin_create_cq { + __le32 eq_id; + __u8 depth_log2; + __u8 stride_log2; + __u8 dir_size_log2_rsvd; + __u8 page_size_log2; + __le32 cq_flags; + __le32 id_ver; + __le32 tbl_index; + __le32 map_count; + __le64 dma_addr; + __le16 dbid_flags; +} __packed; + +#define IONIC_ADMIN_CREATE_CQ_IN_V1_LEN 34 +static_assert(sizeof(struct ionic_admin_create_cq) == + IONIC_ADMIN_CREATE_CQ_IN_V1_LEN); + +struct ionic_admin_destroy_cq { + __le32 cq_id; +} __packed; + +#define IONIC_ADMIN_DESTROY_CQ_IN_V1_LEN 4 +static_assert(sizeof(struct ionic_admin_destroy_cq) == + IONIC_ADMIN_DESTROY_CQ_IN_V1_LEN); + +struct ionic_admin_create_qp { + __le32 pd_id; + __be32 priv_flags; + __le32 sq_cq_id; + __u8 sq_depth_log2; + __u8 sq_stride_log2; + __u8 sq_dir_size_log2_rsvd; + __u8 sq_page_size_log2; + __le32 sq_tbl_index_xrcd_id; + __le32 sq_map_count; + __le64 sq_dma_addr; + __le32 rq_cq_id; + __u8 rq_depth_log2; + __u8 rq_stride_log2; + __u8 rq_dir_size_log2_rsvd; + __u8 rq_page_size_log2; + __le32 rq_tbl_index_srq_id; + __le32 rq_map_count; + __le64 rq_dma_addr; + __le32 id_ver; + __le16 dbid_flags; + __u8 type_state; + __u8 rsvd; +} __packed; + +#define IONIC_ADMIN_CREATE_QP_IN_V1_LEN 64 +static_assert(sizeof(struct ionic_admin_create_qp) == + IONIC_ADMIN_CREATE_QP_IN_V1_LEN); + +struct ionic_admin_destroy_qp { + __le32 qp_id; 
+} __packed;
+
+#define IONIC_ADMIN_DESTROY_QP_IN_V1_LEN 4
+static_assert(sizeof(struct ionic_admin_destroy_qp) ==
+	      IONIC_ADMIN_DESTROY_QP_IN_V1_LEN);
+
+struct ionic_admin_mod_qp {
+	__be32 attr_mask;
+	__u8 dcqcn_profile;
+	__u8 tfp_csum_profile;
+	__be16 access_flags;
+	__le32 rq_psn;
+	__le32 sq_psn;
+	__le32 qkey_dest_qpn;
+	__le32 rate_limit_kbps;
+	__u8 pmtu;
+	__u8 retry;
+	__u8 rnr_timer;
+	__u8 retry_timeout;
+	__u8 rsq_depth;
+	__u8 rrq_depth;
+	__le16 pkey_id;
+	__le32 ah_id_len;
+	__u8 en_pcp;
+	__u8 ip_dscp;
+	__u8 rsvd2;
+	__u8 type_state;
+	union {
+		struct {
+			__le16 rsvd1;
+		};
+		__le32 rrq_index;
+	};
+	__le32 rsq_index;
+	__le64 dma_addr;
+	__le32 id_ver;
+} __packed;
+
+#define IONIC_ADMIN_MODIFY_QP_IN_V1_LEN 60
+static_assert(sizeof(struct ionic_admin_mod_qp) ==
+	      IONIC_ADMIN_MODIFY_QP_IN_V1_LEN);
+
+struct ionic_admin_query_qp {
+	__le64 hdr_dma_addr;
+	__le64 sq_dma_addr;
+	__le64 rq_dma_addr;
+	__le32 ah_id;
+	__le32 id_ver;
+	__le16 dbid_flags;
+} __packed;
+
+#define IONIC_ADMIN_QUERY_QP_IN_V1_LEN 34
+static_assert(sizeof(struct ionic_admin_query_qp) ==
+	      IONIC_ADMIN_QUERY_QP_IN_V1_LEN);
+
+#define ADMIN_WQE_STRIDE 64
+#define ADMIN_WQE_HDR_LEN 4
+
+/* admin queue v1 wqe */
+struct ionic_v1_admin_wqe {
+	__u8 op;
+	__u8 rsvd;
+	__le16 len;
+
+	union {
+		struct ionic_admin_stats_hdr stats;
+		struct ionic_admin_create_ah create_ah;
+		struct ionic_admin_destroy_ah destroy_ah;
+		struct ionic_admin_query_ah query_ah;
+		struct ionic_admin_create_mr create_mr;
+		struct ionic_admin_destroy_mr destroy_mr;
+		struct ionic_admin_create_cq create_cq;
+		struct ionic_admin_destroy_cq destroy_cq;
+		struct ionic_admin_create_qp create_qp;
+		struct ionic_admin_destroy_qp destroy_qp;
+		struct ionic_admin_mod_qp mod_qp;
+		struct ionic_admin_query_qp query_qp;
+	} cmd;
+};
+
+/* side data for query qp */
+struct ionic_v1_admin_query_qp_sq {
+	__u8 rnr_timer;
+	__u8 retry_timeout;
+	__be16 access_perms_flags;
+	__be16 rsvd;
+	__be16 pkey_id;
+	__be32 qkey_dest_qpn;
+	__be32 rate_limit_kbps;
+	__be32 rq_psn;
+};
+
+struct ionic_v1_admin_query_qp_rq {
+	__u8 state_pmtu;
+	__u8 retry_rnrtry;
+	__u8 rrq_depth;
+	__u8 rsq_depth;
+	__be32 sq_psn;
+	__be16 access_perms_flags;
+	__be16 rsvd;
+};
+
+/* admin queue v1 opcodes */
+enum ionic_v1_admin_op {
+	IONIC_V1_ADMIN_NOOP,
+	IONIC_V1_ADMIN_CREATE_CQ,
+	IONIC_V1_ADMIN_CREATE_QP,
+	IONIC_V1_ADMIN_CREATE_MR,
+	IONIC_V1_ADMIN_STATS_HDRS,
+	IONIC_V1_ADMIN_STATS_VALS,
+	IONIC_V1_ADMIN_DESTROY_MR,
+	IONIC_V1_ADMIN_RSVD_7, /* RESIZE_CQ */
+	IONIC_V1_ADMIN_DESTROY_CQ,
+	IONIC_V1_ADMIN_MODIFY_QP,
+	IONIC_V1_ADMIN_QUERY_QP,
+	IONIC_V1_ADMIN_DESTROY_QP,
+	IONIC_V1_ADMIN_DEBUG,
+	IONIC_V1_ADMIN_CREATE_AH,
+	IONIC_V1_ADMIN_QUERY_AH,
+	IONIC_V1_ADMIN_MODIFY_DCQCN,
+	IONIC_V1_ADMIN_DESTROY_AH,
+	IONIC_V1_ADMIN_QP_STATS_HDRS,
+	IONIC_V1_ADMIN_QP_STATS_VALS,
+	IONIC_V1_ADMIN_OPCODES_MAX,
+};
+
+/* admin queue v1 cqe status */
+enum ionic_v1_admin_status {
+	IONIC_V1_ASTS_OK,
+	IONIC_V1_ASTS_BAD_CMD,
+	IONIC_V1_ASTS_BAD_INDEX,
+	IONIC_V1_ASTS_BAD_STATE,
+	IONIC_V1_ASTS_BAD_TYPE,
+	IONIC_V1_ASTS_BAD_ATTR,
+	IONIC_V1_ASTS_MSG_TOO_BIG,
+};
+
+/* event queue v1 eqe */
+struct ionic_v1_eqe {
+	__be32 evt;
+};
+
+/* bits for eqe evt */
+enum ionic_v1_eqe_evt_bits {
+	IONIC_V1_EQE_COLOR = BIT(0),
+	IONIC_V1_EQE_TYPE_SHIFT = 1,
+	IONIC_V1_EQE_TYPE_MASK = 0x7,
+	IONIC_V1_EQE_CODE_SHIFT = 4,
+	IONIC_V1_EQE_CODE_MASK = 0xf,
+	IONIC_V1_EQE_QID_SHIFT = 8,
+
+	/* cq events */
+	IONIC_V1_EQE_TYPE_CQ = 0,
+	/* cq normal events */
+	IONIC_V1_EQE_CQ_NOTIFY = 0,
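+	/* event codes with BIT(3) set indicate error events */ +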
/* cq error events */ + IONIC_V1_EQE_CQ_ERR = 8, + + /* qp and srq events */ + IONIC_V1_EQE_TYPE_QP = 1, + /* qp normal events */ + IONIC_V1_EQE_SRQ_LEVEL = 0, + IONIC_V1_EQE_SQ_DRAIN = 1, + IONIC_V1_EQE_QP_COMM_EST = 2, + IONIC_V1_EQE_QP_LAST_WQE = 3, + /* qp error events */ + IONIC_V1_EQE_QP_ERR = 8, + IONIC_V1_EQE_QP_ERR_REQUEST = 9, + IONIC_V1_EQE_QP_ERR_ACCESS = 10, +}; + +enum ionic_tfp_csum_profiles { + IONIC_TFP_CSUM_PROF_ETH_IPV4_UDP = 0, + IONIC_TFP_CSUM_PROF_ETH_QTAG_IPV4_UDP = 1, + IONIC_TFP_CSUM_PROF_ETH_IPV6_UDP = 2, + IONIC_TFP_CSUM_PROF_ETH_QTAG_IPV6_UDP = 3, + IONIC_TFP_CSUM_PROF_IPV4_UDP_VXLAN_ETH_QTAG_IPV4_UDP = 4, + IONIC_TFP_CSUM_PROF_IPV4_UDP_VXLAN_ETH_QTAG_IPV6_UDP = 5, + IONIC_TFP_CSUM_PROF_QTAG_IPV4_UDP_VXLAN_ETH_QTAG_IPV4_UDP = 6, + IONIC_TFP_CSUM_PROF_QTAG_IPV4_UDP_VXLAN_ETH_QTAG_IPV6_UDP = 7, + IONIC_TFP_CSUM_PROF_ETH_QTAG_IPV4_UDP_ESP_IPV4_UDP = 8, + IONIC_TFP_CSUM_PROF_ETH_QTAG_IPV4_ESP_UDP = 9, + IONIC_TFP_CSUM_PROF_ETH_QTAG_IPV4_UDP_ESP_UDP = 10, + IONIC_TFP_CSUM_PROF_ETH_QTAG_IPV6_ESP_UDP = 11, + IONIC_TFP_CSUM_PROF_ETH_QTAG_IPV4_UDP_CSUM = 12, +}; + +static inline bool ionic_v1_eqe_color(struct ionic_v1_eqe *eqe) +{ + return eqe->evt & cpu_to_be32(IONIC_V1_EQE_COLOR); +} + +static inline u32 ionic_v1_eqe_evt(struct ionic_v1_eqe *eqe) +{ + return be32_to_cpu(eqe->evt); +} + +static inline u8 ionic_v1_eqe_evt_type(u32 evt) +{ + return (evt >> IONIC_V1_EQE_TYPE_SHIFT) & IONIC_V1_EQE_TYPE_MASK; +} + +static inline u8 ionic_v1_eqe_evt_code(u32 evt) +{ + return (evt >> IONIC_V1_EQE_CODE_SHIFT) & IONIC_V1_EQE_CODE_MASK; +} + +static inline u32 ionic_v1_eqe_evt_qid(u32 evt) +{ + return evt >> IONIC_V1_EQE_QID_SHIFT; +} + +enum ionic_v1_stat_bits { + IONIC_V1_STAT_TYPE_SHIFT = 28, + IONIC_V1_STAT_TYPE_NONE = 0, + IONIC_V1_STAT_TYPE_8 = 1, + IONIC_V1_STAT_TYPE_LE16 = 2, + IONIC_V1_STAT_TYPE_LE32 = 3, + IONIC_V1_STAT_TYPE_LE64 = 4, + IONIC_V1_STAT_TYPE_BE16 = 5, + IONIC_V1_STAT_TYPE_BE32 = 6, + IONIC_V1_STAT_TYPE_BE64 = 7, + IONIC_V1_STAT_OFF_MASK = BIT(IONIC_V1_STAT_TYPE_SHIFT) - 1, +}; + +struct ionic_v1_stat { + union { + __be32 be_type_off; + u32 type_off; + }; + char name[28]; +}; + +static inline int ionic_v1_stat_type(struct ionic_v1_stat *hdr) +{ + return hdr->type_off >> IONIC_V1_STAT_TYPE_SHIFT; +} + +static inline unsigned int ionic_v1_stat_off(struct ionic_v1_stat *hdr) +{ + return hdr->type_off & IONIC_V1_STAT_OFF_MASK; +} + +#endif /* _IONIC_FW_H_ */ diff --git a/drivers/infiniband/hw/ionic/ionic_hw_stats.c b/drivers/infiniband/hw/ionic/ionic_hw_stats.c new file mode 100644 index 000000000000..244a80dde08f --- /dev/null +++ b/drivers/infiniband/hw/ionic/ionic_hw_stats.c @@ -0,0 +1,484 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2025, Advanced Micro Devices, Inc. 
*/ + +#include <linux/dma-mapping.h> + +#include "ionic_fw.h" +#include "ionic_ibdev.h" + +static int ionic_v1_stat_normalize(struct ionic_v1_stat *hw_stats, + int hw_stats_count) +{ + int hw_stat_i; + + for (hw_stat_i = 0; hw_stat_i < hw_stats_count; ++hw_stat_i) { + struct ionic_v1_stat *stat = &hw_stats[hw_stat_i]; + + stat->type_off = be32_to_cpu(stat->be_type_off); + stat->name[sizeof(stat->name) - 1] = 0; + if (ionic_v1_stat_type(stat) == IONIC_V1_STAT_TYPE_NONE) + break; + } + + return hw_stat_i; +} + +static void ionic_fill_stats_desc(struct rdma_stat_desc *hw_stats_hdrs, + struct ionic_v1_stat *hw_stats, + int hw_stats_count) +{ + int hw_stat_i; + + for (hw_stat_i = 0; hw_stat_i < hw_stats_count; ++hw_stat_i) { + struct ionic_v1_stat *stat = &hw_stats[hw_stat_i]; + + hw_stats_hdrs[hw_stat_i].name = stat->name; + } +} + +static u64 ionic_v1_stat_val(struct ionic_v1_stat *stat, + void *vals_buf, size_t vals_len) +{ + unsigned int off = ionic_v1_stat_off(stat); + int type = ionic_v1_stat_type(stat); + +#define __ionic_v1_stat_validate(__type) \ + ((off + sizeof(__type) <= vals_len) && \ + (IS_ALIGNED(off, sizeof(__type)))) + + switch (type) { + case IONIC_V1_STAT_TYPE_8: + if (__ionic_v1_stat_validate(u8)) + return *(u8 *)(vals_buf + off); + break; + case IONIC_V1_STAT_TYPE_LE16: + if (__ionic_v1_stat_validate(__le16)) + return le16_to_cpu(*(__le16 *)(vals_buf + off)); + break; + case IONIC_V1_STAT_TYPE_LE32: + if (__ionic_v1_stat_validate(__le32)) + return le32_to_cpu(*(__le32 *)(vals_buf + off)); + break; + case IONIC_V1_STAT_TYPE_LE64: + if (__ionic_v1_stat_validate(__le64)) + return le64_to_cpu(*(__le64 *)(vals_buf + off)); + break; + case IONIC_V1_STAT_TYPE_BE16: + if (__ionic_v1_stat_validate(__be16)) + return be16_to_cpu(*(__be16 *)(vals_buf + off)); + break; + case IONIC_V1_STAT_TYPE_BE32: + if (__ionic_v1_stat_validate(__be32)) + return be32_to_cpu(*(__be32 *)(vals_buf + off)); + break; + case IONIC_V1_STAT_TYPE_BE64: + if (__ionic_v1_stat_validate(__be64)) + return be64_to_cpu(*(__be64 *)(vals_buf + off)); + break; + } + + return ~0ull; +#undef __ionic_v1_stat_validate +} + +static int ionic_hw_stats_cmd(struct ionic_ibdev *dev, + dma_addr_t dma, size_t len, int qid, int op) +{ + struct ionic_admin_wr wr = { + .work = COMPLETION_INITIALIZER_ONSTACK(wr.work), + .wqe = { + .op = op, + .len = cpu_to_le16(IONIC_ADMIN_STATS_HDRS_IN_V1_LEN), + .cmd.stats = { + .dma_addr = cpu_to_le64(dma), + .length = cpu_to_le32(len), + .id_ver = cpu_to_le32(qid), + }, + } + }; + + if (dev->lif_cfg.admin_opcodes <= op) + return -EBADRQC; + + ionic_admin_post(dev, &wr); + + return ionic_admin_wait(dev, &wr, IONIC_ADMIN_F_INTERRUPT); +} + +static int ionic_init_hw_stats(struct ionic_ibdev *dev) +{ + dma_addr_t hw_stats_dma; + int rc, hw_stats_count; + + if (dev->hw_stats_hdrs) + return 0; + + dev->hw_stats_count = 0; + + /* buffer for current values from the device */ + dev->hw_stats_buf = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!dev->hw_stats_buf) { + rc = -ENOMEM; + goto err_buf; + } + + /* buffer for names, sizes, offsets of values */ + dev->hw_stats = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!dev->hw_stats) { + rc = -ENOMEM; + goto err_hw_stats; + } + + /* request the names, sizes, offsets */ + hw_stats_dma = dma_map_single(dev->lif_cfg.hwdev, dev->hw_stats, + PAGE_SIZE, DMA_FROM_DEVICE); + rc = dma_mapping_error(dev->lif_cfg.hwdev, hw_stats_dma); + if (rc) + goto err_dma; + + rc = ionic_hw_stats_cmd(dev, hw_stats_dma, PAGE_SIZE, 0, + IONIC_V1_ADMIN_STATS_HDRS); + if (rc) + goto err_cmd; + + 
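/* on success, the device has filled the buffer; unmap before parsing */ +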
dma_unmap_single(dev->lif_cfg.hwdev, hw_stats_dma, PAGE_SIZE, DMA_FROM_DEVICE); + + /* normalize and count the number of hw_stats */ + hw_stats_count = + ionic_v1_stat_normalize(dev->hw_stats, + PAGE_SIZE / sizeof(*dev->hw_stats)); + if (!hw_stats_count) { + rc = -ENODATA; + goto err_dma; + } + + dev->hw_stats_count = hw_stats_count; + + /* alloc and init array of names, for alloc_hw_stats */ + dev->hw_stats_hdrs = kcalloc(hw_stats_count, + sizeof(*dev->hw_stats_hdrs), + GFP_KERNEL); + if (!dev->hw_stats_hdrs) { + rc = -ENOMEM; + goto err_dma; + } + + ionic_fill_stats_desc(dev->hw_stats_hdrs, dev->hw_stats, + hw_stats_count); + + return 0; + +err_cmd: + dma_unmap_single(dev->lif_cfg.hwdev, hw_stats_dma, PAGE_SIZE, DMA_FROM_DEVICE); +err_dma: + kfree(dev->hw_stats); +err_hw_stats: + kfree(dev->hw_stats_buf); +err_buf: + dev->hw_stats_count = 0; + dev->hw_stats = NULL; + dev->hw_stats_buf = NULL; + dev->hw_stats_hdrs = NULL; + return rc; +} + +static struct rdma_hw_stats *ionic_alloc_hw_stats(struct ib_device *ibdev, + u32 port) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibdev); + + if (port != 1) + return NULL; + + return rdma_alloc_hw_stats_struct(dev->hw_stats_hdrs, + dev->hw_stats_count, + RDMA_HW_STATS_DEFAULT_LIFESPAN); +} + +static int ionic_get_hw_stats(struct ib_device *ibdev, + struct rdma_hw_stats *hw_stats, + u32 port, int index) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibdev); + dma_addr_t hw_stats_dma; + int rc, hw_stat_i; + + if (port != 1) + return -EINVAL; + + hw_stats_dma = dma_map_single(dev->lif_cfg.hwdev, dev->hw_stats_buf, + PAGE_SIZE, DMA_FROM_DEVICE); + rc = dma_mapping_error(dev->lif_cfg.hwdev, hw_stats_dma); + if (rc) + goto err_dma; + + rc = ionic_hw_stats_cmd(dev, hw_stats_dma, PAGE_SIZE, + 0, IONIC_V1_ADMIN_STATS_VALS); + if (rc) + goto err_cmd; + + dma_unmap_single(dev->lif_cfg.hwdev, hw_stats_dma, + PAGE_SIZE, DMA_FROM_DEVICE); + + for (hw_stat_i = 0; hw_stat_i < dev->hw_stats_count; ++hw_stat_i) + hw_stats->value[hw_stat_i] = + ionic_v1_stat_val(&dev->hw_stats[hw_stat_i], + dev->hw_stats_buf, PAGE_SIZE); + + return hw_stat_i; + +err_cmd: + dma_unmap_single(dev->lif_cfg.hwdev, hw_stats_dma, + PAGE_SIZE, DMA_FROM_DEVICE); +err_dma: + return rc; +} + +static struct rdma_hw_stats * +ionic_counter_alloc_stats(struct rdma_counter *counter) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(counter->device); + struct ionic_counter *cntr; + int err; + + cntr = kzalloc(sizeof(*cntr), GFP_KERNEL); + if (!cntr) + return NULL; + + /* buffer for current values from the device */ + cntr->vals = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!cntr->vals) + goto err_vals; + + err = xa_alloc(&dev->counter_stats->xa_counters, &counter->id, + cntr, + XA_LIMIT(0, IONIC_MAX_QPID), + GFP_KERNEL); + if (err) + goto err_xa; + + INIT_LIST_HEAD(&cntr->qp_list); + + return rdma_alloc_hw_stats_struct(dev->counter_stats->stats_hdrs, + dev->counter_stats->queue_stats_count, + RDMA_HW_STATS_DEFAULT_LIFESPAN); +err_xa: + kfree(cntr->vals); +err_vals: + kfree(cntr); + + return NULL; +} + +static int ionic_counter_dealloc(struct rdma_counter *counter) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(counter->device); + struct ionic_counter *cntr; + + cntr = xa_erase(&dev->counter_stats->xa_counters, counter->id); + if (!cntr) + return -EINVAL; + + kfree(cntr->vals); + kfree(cntr); + + return 0; +} + +static int ionic_counter_bind_qp(struct rdma_counter *counter, + struct ib_qp *ibqp, + u32 port) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(counter->device); + struct ionic_qp *qp = to_ionic_qp(ibqp); + 
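/* the counter object tracks its bound qps on qp_list for stats sums */ +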
struct ionic_counter *cntr; + + cntr = xa_load(&dev->counter_stats->xa_counters, counter->id); + if (!cntr) + return -EINVAL; + + list_add_tail(&qp->qp_list_counter, &cntr->qp_list); + ibqp->counter = counter; + + return 0; +} + +static int ionic_counter_unbind_qp(struct ib_qp *ibqp, u32 port) +{ + struct ionic_qp *qp = to_ionic_qp(ibqp); + + if (ibqp->counter) { + list_del(&qp->qp_list_counter); + ibqp->counter = NULL; + } + + return 0; +} + +static int ionic_get_qp_stats(struct ib_device *ibdev, + struct rdma_hw_stats *hw_stats, + u32 counter_id) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibdev); + struct ionic_counter_stats *cs; + struct ionic_counter *cntr; + dma_addr_t hw_stats_dma; + struct ionic_qp *qp; + int rc, stat_i = 0; + + cs = dev->counter_stats; + cntr = xa_load(&cs->xa_counters, counter_id); + if (!cntr) + return -EINVAL; + + hw_stats_dma = dma_map_single(dev->lif_cfg.hwdev, cntr->vals, + PAGE_SIZE, DMA_FROM_DEVICE); + rc = dma_mapping_error(dev->lif_cfg.hwdev, hw_stats_dma); + if (rc) + return rc; + + memset(hw_stats->value, 0, sizeof(u64) * hw_stats->num_counters); + + list_for_each_entry(qp, &cntr->qp_list, qp_list_counter) { + rc = ionic_hw_stats_cmd(dev, hw_stats_dma, PAGE_SIZE, + qp->qpid, + IONIC_V1_ADMIN_QP_STATS_VALS); + if (rc) + goto err_cmd; + + for (stat_i = 0; stat_i < cs->queue_stats_count; ++stat_i) + hw_stats->value[stat_i] += + ionic_v1_stat_val(&cs->hdr[stat_i], + cntr->vals, + PAGE_SIZE); + } + + dma_unmap_single(dev->lif_cfg.hwdev, hw_stats_dma, PAGE_SIZE, DMA_FROM_DEVICE); + return stat_i; + +err_cmd: + dma_unmap_single(dev->lif_cfg.hwdev, hw_stats_dma, PAGE_SIZE, DMA_FROM_DEVICE); + + return rc; +} + +static int ionic_counter_update_stats(struct rdma_counter *counter) +{ + return ionic_get_qp_stats(counter->device, counter->stats, counter->id); +} + +static int ionic_alloc_counters(struct ionic_ibdev *dev) +{ + struct ionic_counter_stats *cs = dev->counter_stats; + int rc, hw_stats_count; + dma_addr_t hdr_dma; + + /* buffer for names, sizes, offsets of values */ + cs->hdr = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!cs->hdr) + return -ENOMEM; + + hdr_dma = dma_map_single(dev->lif_cfg.hwdev, cs->hdr, + PAGE_SIZE, DMA_FROM_DEVICE); + rc = dma_mapping_error(dev->lif_cfg.hwdev, hdr_dma); + if (rc) + goto err_dma; + + rc = ionic_hw_stats_cmd(dev, hdr_dma, PAGE_SIZE, 0, + IONIC_V1_ADMIN_QP_STATS_HDRS); + if (rc) + goto err_cmd; + + dma_unmap_single(dev->lif_cfg.hwdev, hdr_dma, PAGE_SIZE, DMA_FROM_DEVICE); + + /* normalize and count the number of hw_stats */ + hw_stats_count = ionic_v1_stat_normalize(cs->hdr, + PAGE_SIZE / sizeof(*cs->hdr)); + if (!hw_stats_count) { + rc = -ENODATA; + goto err_dma; + } + + cs->queue_stats_count = hw_stats_count; + + /* alloc and init array of names */ + cs->stats_hdrs = kcalloc(hw_stats_count, sizeof(*cs->stats_hdrs), + GFP_KERNEL); + if (!cs->stats_hdrs) { + rc = -ENOMEM; + goto err_dma; + } + + ionic_fill_stats_desc(cs->stats_hdrs, cs->hdr, hw_stats_count); + + return 0; + +err_cmd: + dma_unmap_single(dev->lif_cfg.hwdev, hdr_dma, PAGE_SIZE, DMA_FROM_DEVICE); +err_dma: + kfree(cs->hdr); + + return rc; +} + +static const struct ib_device_ops ionic_hw_stats_ops = { + .driver_id = RDMA_DRIVER_IONIC, + .alloc_hw_port_stats = ionic_alloc_hw_stats, + .get_hw_stats = ionic_get_hw_stats, +}; + +static const struct ib_device_ops ionic_counter_stats_ops = { + .counter_alloc_stats = ionic_counter_alloc_stats, + .counter_dealloc = ionic_counter_dealloc, + .counter_bind_qp = ionic_counter_bind_qp, + .counter_unbind_qp = 
ionic_counter_unbind_qp, + .counter_update_stats = ionic_counter_update_stats, +}; + +void ionic_stats_init(struct ionic_ibdev *dev) +{ + u16 stats_type = dev->lif_cfg.stats_type; + int rc; + + if (stats_type & IONIC_LIF_RDMA_STAT_GLOBAL) { + rc = ionic_init_hw_stats(dev); + if (rc) + ibdev_dbg(&dev->ibdev, "Failed to init hw stats\n"); + else + ib_set_device_ops(&dev->ibdev, &ionic_hw_stats_ops); + } + + if (stats_type & IONIC_LIF_RDMA_STAT_QP) { + dev->counter_stats = kzalloc(sizeof(*dev->counter_stats), + GFP_KERNEL); + if (!dev->counter_stats) + return; + + rc = ionic_alloc_counters(dev); + if (rc) { + ibdev_dbg(&dev->ibdev, "Failed to init counter stats\n"); + kfree(dev->counter_stats); + dev->counter_stats = NULL; + return; + } + + xa_init_flags(&dev->counter_stats->xa_counters, XA_FLAGS_ALLOC); + + ib_set_device_ops(&dev->ibdev, &ionic_counter_stats_ops); + } +} + +void ionic_stats_cleanup(struct ionic_ibdev *dev) +{ + if (dev->counter_stats) { + xa_destroy(&dev->counter_stats->xa_counters); + kfree(dev->counter_stats->hdr); + kfree(dev->counter_stats->stats_hdrs); + kfree(dev->counter_stats); + dev->counter_stats = NULL; + } + + kfree(dev->hw_stats); + kfree(dev->hw_stats_buf); + kfree(dev->hw_stats_hdrs); +} diff --git a/drivers/infiniband/hw/ionic/ionic_ibdev.c b/drivers/infiniband/hw/ionic/ionic_ibdev.c new file mode 100644 index 000000000000..164046d00e5d --- /dev/null +++ b/drivers/infiniband/hw/ionic/ionic_ibdev.c @@ -0,0 +1,440 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2025, Advanced Micro Devices, Inc. */ + +#include <linux/module.h> +#include <linux/printk.h> +#include <linux/pci.h> +#include <linux/irq.h> +#include <net/addrconf.h> +#include <rdma/ib_addr.h> +#include <rdma/ib_mad.h> + +#include "ionic_ibdev.h" + +#define DRIVER_DESCRIPTION "AMD Pensando RoCE HCA driver" +#define DEVICE_DESCRIPTION "AMD Pensando RoCE HCA" + +MODULE_AUTHOR("Allen Hubbe <allen.hubbe@amd.com>"); +MODULE_DESCRIPTION(DRIVER_DESCRIPTION); +MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS("NET_IONIC"); + +static int ionic_query_device(struct ib_device *ibdev, + struct ib_device_attr *attr, + struct ib_udata *udata) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibdev); + struct net_device *ndev; + + ndev = ib_device_get_netdev(ibdev, 1); + addrconf_ifid_eui48((u8 *)&attr->sys_image_guid, ndev); + dev_put(ndev); + attr->max_mr_size = dev->lif_cfg.npts_per_lif * PAGE_SIZE / 2; + attr->page_size_cap = dev->lif_cfg.page_size_supported; + + attr->vendor_id = to_pci_dev(dev->lif_cfg.hwdev)->vendor; + attr->vendor_part_id = to_pci_dev(dev->lif_cfg.hwdev)->device; + + attr->hw_ver = ionic_lif_asic_rev(dev->lif_cfg.lif); + attr->fw_ver = 0; + attr->max_qp = dev->lif_cfg.qp_count; + attr->max_qp_wr = IONIC_MAX_DEPTH; + attr->device_cap_flags = + IB_DEVICE_MEM_WINDOW | + IB_DEVICE_MEM_MGT_EXTENSIONS | + IB_DEVICE_MEM_WINDOW_TYPE_2B | + 0; + attr->max_send_sge = + min(ionic_v1_send_wqe_max_sge(dev->lif_cfg.max_stride, 0, false), + IONIC_SPEC_HIGH); + attr->max_recv_sge = + min(ionic_v1_recv_wqe_max_sge(dev->lif_cfg.max_stride, 0, false), + IONIC_SPEC_HIGH); + attr->max_sge_rd = attr->max_send_sge; + attr->max_cq = dev->lif_cfg.cq_count / dev->lif_cfg.udma_count; + attr->max_cqe = IONIC_MAX_CQ_DEPTH - IONIC_CQ_GRACE; + attr->max_mr = dev->lif_cfg.nmrs_per_lif; + attr->max_pd = IONIC_MAX_PD; + attr->max_qp_rd_atom = IONIC_MAX_RD_ATOM; + attr->max_ee_rd_atom = 0; + attr->max_res_rd_atom = IONIC_MAX_RD_ATOM; + attr->max_qp_init_rd_atom = IONIC_MAX_RD_ATOM; + attr->max_ee_init_rd_atom = 0; + 
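/* atomic operations are coherent system-wide, not just within the HCA */ +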
attr->atomic_cap = IB_ATOMIC_GLOB; + attr->masked_atomic_cap = IB_ATOMIC_GLOB; + attr->max_mw = dev->lif_cfg.nmrs_per_lif; + attr->max_mcast_grp = 0; + attr->max_mcast_qp_attach = 0; + attr->max_ah = dev->lif_cfg.nahs_per_lif; + attr->max_fast_reg_page_list_len = dev->lif_cfg.npts_per_lif / 2; + attr->max_pkeys = IONIC_PKEY_TBL_LEN; + + return 0; +} + +static int ionic_query_port(struct ib_device *ibdev, u32 port, + struct ib_port_attr *attr) +{ + struct net_device *ndev; + + if (port != 1) + return -EINVAL; + + ndev = ib_device_get_netdev(ibdev, port); + + if (netif_running(ndev) && netif_carrier_ok(ndev)) { + attr->state = IB_PORT_ACTIVE; + attr->phys_state = IB_PORT_PHYS_STATE_LINK_UP; + } else if (netif_running(ndev)) { + attr->state = IB_PORT_DOWN; + attr->phys_state = IB_PORT_PHYS_STATE_POLLING; + } else { + attr->state = IB_PORT_DOWN; + attr->phys_state = IB_PORT_PHYS_STATE_DISABLED; + } + + attr->max_mtu = iboe_get_mtu(ndev->max_mtu); + attr->active_mtu = min(attr->max_mtu, iboe_get_mtu(ndev->mtu)); + attr->gid_tbl_len = IONIC_GID_TBL_LEN; + attr->ip_gids = true; + attr->port_cap_flags = 0; + attr->max_msg_sz = 0x80000000; + attr->pkey_tbl_len = IONIC_PKEY_TBL_LEN; + attr->max_vl_num = 1; + attr->subnet_prefix = 0xfe80000000000000ull; + + dev_put(ndev); + + return ib_get_eth_speed(ibdev, port, + &attr->active_speed, + &attr->active_width); +} + +static enum rdma_link_layer ionic_get_link_layer(struct ib_device *ibdev, + u32 port) +{ + return IB_LINK_LAYER_ETHERNET; +} + +static int ionic_query_pkey(struct ib_device *ibdev, u32 port, u16 index, + u16 *pkey) +{ + if (port != 1) + return -EINVAL; + + if (index != 0) + return -EINVAL; + + *pkey = IB_DEFAULT_PKEY_FULL; + + return 0; +} + +static int ionic_modify_device(struct ib_device *ibdev, int mask, + struct ib_device_modify *attr) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibdev); + + if (mask & ~IB_DEVICE_MODIFY_NODE_DESC) + return -EOPNOTSUPP; + + if (mask & IB_DEVICE_MODIFY_NODE_DESC) + memcpy(dev->ibdev.node_desc, attr->node_desc, + IB_DEVICE_NODE_DESC_MAX); + + return 0; +} + +static int ionic_get_port_immutable(struct ib_device *ibdev, u32 port, + struct ib_port_immutable *attr) +{ + if (port != 1) + return -EINVAL; + + attr->core_cap_flags = RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP; + + attr->pkey_tbl_len = IONIC_PKEY_TBL_LEN; + attr->gid_tbl_len = IONIC_GID_TBL_LEN; + attr->max_mad_size = IB_MGMT_MAD_SIZE; + + return 0; +} + +static void ionic_get_dev_fw_str(struct ib_device *ibdev, char *str) +{ + struct ionic_ibdev *dev = to_ionic_ibdev(ibdev); + + ionic_lif_fw_version(dev->lif_cfg.lif, str, IB_FW_VERSION_NAME_MAX); +} + +static ssize_t hw_rev_show(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct ionic_ibdev *dev = + rdma_device_to_drv_device(device, struct ionic_ibdev, ibdev); + + return sysfs_emit(buf, "0x%x\n", ionic_lif_asic_rev(dev->lif_cfg.lif)); +} +static DEVICE_ATTR_RO(hw_rev); + +static ssize_t hca_type_show(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct ionic_ibdev *dev = + rdma_device_to_drv_device(device, struct ionic_ibdev, ibdev); + + return sysfs_emit(buf, "%s\n", dev->ibdev.node_desc); +} +static DEVICE_ATTR_RO(hca_type); + +static struct attribute *ionic_rdma_attributes[] = { + &dev_attr_hw_rev.attr, + &dev_attr_hca_type.attr, + NULL +}; + +static const struct attribute_group ionic_rdma_attr_group = { + .attrs = ionic_rdma_attributes, +}; + +static void ionic_disassociate_ucontext(struct ib_ucontext *ibcontext) +{ + /* + * Dummy define 
disassociate_ucontext so that it does not + * wait for user context before cleaning up hw resources. + */ +} + +static const struct ib_device_ops ionic_dev_ops = { + .owner = THIS_MODULE, + .driver_id = RDMA_DRIVER_IONIC, + .uverbs_abi_ver = IONIC_ABI_VERSION, + + .alloc_ucontext = ionic_alloc_ucontext, + .dealloc_ucontext = ionic_dealloc_ucontext, + .mmap = ionic_mmap, + .mmap_free = ionic_mmap_free, + .alloc_pd = ionic_alloc_pd, + .dealloc_pd = ionic_dealloc_pd, + .create_ah = ionic_create_ah, + .query_ah = ionic_query_ah, + .destroy_ah = ionic_destroy_ah, + .create_user_ah = ionic_create_ah, + .get_dma_mr = ionic_get_dma_mr, + .reg_user_mr = ionic_reg_user_mr, + .reg_user_mr_dmabuf = ionic_reg_user_mr_dmabuf, + .dereg_mr = ionic_dereg_mr, + .alloc_mr = ionic_alloc_mr, + .map_mr_sg = ionic_map_mr_sg, + .alloc_mw = ionic_alloc_mw, + .dealloc_mw = ionic_dealloc_mw, + .create_cq = ionic_create_cq, + .destroy_cq = ionic_destroy_cq, + .create_qp = ionic_create_qp, + .modify_qp = ionic_modify_qp, + .query_qp = ionic_query_qp, + .destroy_qp = ionic_destroy_qp, + + .post_send = ionic_post_send, + .post_recv = ionic_post_recv, + .poll_cq = ionic_poll_cq, + .req_notify_cq = ionic_req_notify_cq, + + .query_device = ionic_query_device, + .query_port = ionic_query_port, + .get_link_layer = ionic_get_link_layer, + .query_pkey = ionic_query_pkey, + .modify_device = ionic_modify_device, + .get_port_immutable = ionic_get_port_immutable, + .get_dev_fw_str = ionic_get_dev_fw_str, + .device_group = &ionic_rdma_attr_group, + .disassociate_ucontext = ionic_disassociate_ucontext, + + INIT_RDMA_OBJ_SIZE(ib_ucontext, ionic_ctx, ibctx), + INIT_RDMA_OBJ_SIZE(ib_pd, ionic_pd, ibpd), + INIT_RDMA_OBJ_SIZE(ib_ah, ionic_ah, ibah), + INIT_RDMA_OBJ_SIZE(ib_cq, ionic_vcq, ibcq), + INIT_RDMA_OBJ_SIZE(ib_qp, ionic_qp, ibqp), + INIT_RDMA_OBJ_SIZE(ib_mw, ionic_mr, ibmw), +}; + +static void ionic_init_resids(struct ionic_ibdev *dev) +{ + ionic_resid_init(&dev->inuse_cqid, dev->lif_cfg.cq_count); + dev->half_cqid_udma_shift = + order_base_2(dev->lif_cfg.cq_count / dev->lif_cfg.udma_count); + ionic_resid_init(&dev->inuse_pdid, IONIC_MAX_PD); + ionic_resid_init(&dev->inuse_ahid, dev->lif_cfg.nahs_per_lif); + ionic_resid_init(&dev->inuse_mrid, dev->lif_cfg.nmrs_per_lif); + /* skip reserved lkey */ + dev->next_mrkey = 1; + ionic_resid_init(&dev->inuse_qpid, dev->lif_cfg.qp_count); + /* skip reserved SMI and GSI qpids */ + dev->half_qpid_udma_shift = + order_base_2(dev->lif_cfg.qp_count / dev->lif_cfg.udma_count); + ionic_resid_init(&dev->inuse_dbid, dev->lif_cfg.dbid_count); +} + +static void ionic_destroy_resids(struct ionic_ibdev *dev) +{ + ionic_resid_destroy(&dev->inuse_cqid); + ionic_resid_destroy(&dev->inuse_pdid); + ionic_resid_destroy(&dev->inuse_ahid); + ionic_resid_destroy(&dev->inuse_mrid); + ionic_resid_destroy(&dev->inuse_qpid); + ionic_resid_destroy(&dev->inuse_dbid); +} + +static void ionic_destroy_ibdev(struct ionic_ibdev *dev) +{ + ionic_kill_rdma_admin(dev, false); + ib_unregister_device(&dev->ibdev); + ionic_stats_cleanup(dev); + ionic_destroy_rdma_admin(dev); + ionic_destroy_resids(dev); + WARN_ON(!xa_empty(&dev->qp_tbl)); + xa_destroy(&dev->qp_tbl); + WARN_ON(!xa_empty(&dev->cq_tbl)); + xa_destroy(&dev->cq_tbl); + ib_dealloc_device(&dev->ibdev); +} + +static struct ionic_ibdev *ionic_create_ibdev(struct ionic_aux_dev *ionic_adev) +{ + struct ib_device *ibdev; + struct ionic_ibdev *dev; + struct net_device *ndev; + int rc; + + dev = ib_alloc_device(ionic_ibdev, ibdev); + if (!dev) + return ERR_PTR(-EINVAL); + + 
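/* snapshot the lif identity and capabilities into the ibdev-local config */ +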
ionic_fill_lif_cfg(ionic_adev->lif, &dev->lif_cfg); + + xa_init_flags(&dev->qp_tbl, GFP_ATOMIC); + xa_init_flags(&dev->cq_tbl, GFP_ATOMIC); + + ionic_init_resids(dev); + + rc = ionic_rdma_reset_devcmd(dev); + if (rc) + goto err_reset; + + rc = ionic_create_rdma_admin(dev); + if (rc) + goto err_admin; + + ibdev = &dev->ibdev; + ibdev->dev.parent = dev->lif_cfg.hwdev; + + strscpy(ibdev->name, "ionic_%d", IB_DEVICE_NAME_MAX); + strscpy(ibdev->node_desc, DEVICE_DESCRIPTION, IB_DEVICE_NODE_DESC_MAX); + + ibdev->node_type = RDMA_NODE_IB_CA; + ibdev->phys_port_cnt = 1; + + /* the first two eq are reserved for async events */ + ibdev->num_comp_vectors = dev->lif_cfg.eq_count - 2; + + ndev = ionic_lif_netdev(ionic_adev->lif); + addrconf_ifid_eui48((u8 *)&ibdev->node_guid, ndev); + rc = ib_device_set_netdev(ibdev, ndev, 1); + /* ionic_lif_netdev() returns ndev with refcount held */ + dev_put(ndev); + if (rc) + goto err_admin; + + ib_set_device_ops(&dev->ibdev, &ionic_dev_ops); + + ionic_stats_init(dev); + + rc = ib_register_device(ibdev, "ionic_%d", ibdev->dev.parent); + if (rc) + goto err_register; + + return dev; + +err_register: + ionic_stats_cleanup(dev); +err_admin: + ionic_kill_rdma_admin(dev, false); + ionic_destroy_rdma_admin(dev); +err_reset: + ionic_destroy_resids(dev); + xa_destroy(&dev->qp_tbl); + xa_destroy(&dev->cq_tbl); + ib_dealloc_device(&dev->ibdev); + + return ERR_PTR(rc); +} + +static int ionic_aux_probe(struct auxiliary_device *adev, + const struct auxiliary_device_id *id) +{ + struct ionic_aux_dev *ionic_adev; + struct ionic_ibdev *dev; + + ionic_adev = container_of(adev, struct ionic_aux_dev, adev); + dev = ionic_create_ibdev(ionic_adev); + if (IS_ERR(dev)) + return dev_err_probe(&adev->dev, PTR_ERR(dev), + "Failed to register ibdev\n"); + + auxiliary_set_drvdata(adev, dev); + ibdev_dbg(&dev->ibdev, "registered\n"); + + return 0; +} + +static void ionic_aux_remove(struct auxiliary_device *adev) +{ + struct ionic_ibdev *dev = auxiliary_get_drvdata(adev); + + dev_dbg(&adev->dev, "unregister ibdev\n"); + ionic_destroy_ibdev(dev); + dev_dbg(&adev->dev, "unregistered\n"); +} + +static const struct auxiliary_device_id ionic_aux_id_table[] = { + { .name = "ionic.rdma", }, + {}, +}; + +MODULE_DEVICE_TABLE(auxiliary, ionic_aux_id_table); + +static struct auxiliary_driver ionic_aux_r_driver = { + .name = "rdma", + .probe = ionic_aux_probe, + .remove = ionic_aux_remove, + .id_table = ionic_aux_id_table, +}; + +static int __init ionic_mod_init(void) +{ + int rc; + + ionic_evt_workq = create_workqueue(KBUILD_MODNAME "-evt"); + if (!ionic_evt_workq) + return -ENOMEM; + + rc = auxiliary_driver_register(&ionic_aux_r_driver); + if (rc) + goto err_aux; + + return 0; + +err_aux: + destroy_workqueue(ionic_evt_workq); + + return rc; +} + +static void __exit ionic_mod_exit(void) +{ + auxiliary_driver_unregister(&ionic_aux_r_driver); + destroy_workqueue(ionic_evt_workq); +} + +module_init(ionic_mod_init); +module_exit(ionic_mod_exit); diff --git a/drivers/infiniband/hw/ionic/ionic_ibdev.h b/drivers/infiniband/hw/ionic/ionic_ibdev.h new file mode 100644 index 000000000000..82fda1e3cdb6 --- /dev/null +++ b/drivers/infiniband/hw/ionic/ionic_ibdev.h @@ -0,0 +1,517 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2025, Advanced Micro Devices, Inc. 
*/
+#ifndef _IONIC_IBDEV_H_
+#define _IONIC_IBDEV_H_
+
+#include <rdma/ib_umem.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_pack.h>
+#include <rdma/uverbs_ioctl.h>
+
+#include <rdma/ionic-abi.h>
+#include <ionic_api.h>
+#include <ionic_regs.h>
+
+#include "ionic_fw.h"
+#include "ionic_queue.h"
+#include "ionic_res.h"
+
+#include "ionic_lif_cfg.h"
+
+/* Config knobs */
+#define IONIC_EQ_DEPTH 511
+#define IONIC_EQ_COUNT 32
+#define IONIC_AQ_DEPTH 63
+#define IONIC_AQ_COUNT 4
+#define IONIC_EQ_ISR_BUDGET 10
+#define IONIC_EQ_WORK_BUDGET 1000
+#define IONIC_MAX_RD_ATOM 16
+#define IONIC_PKEY_TBL_LEN 1
+#define IONIC_GID_TBL_LEN 256
+
+#define IONIC_MAX_QPID 0xffffff
+#define IONIC_SPEC_HIGH 8
+#define IONIC_MAX_PD 1024
+#define IONIC_SQCMB_ORDER 5
+#define IONIC_RQCMB_ORDER 0
+
+#define IONIC_META_LAST ((void *)1ul)
+#define IONIC_META_POSTED ((void *)2ul)
+
+#define IONIC_CQ_GRACE 100
+
+#define IONIC_ROCE_UDP_SPORT 28272
+#define IONIC_DMA_LKEY 0
+#define IONIC_DMA_RKEY IONIC_DMA_LKEY
+
+#define IONIC_CMB_SUPPORTED \
+	(IONIC_CMB_ENABLE | IONIC_CMB_REQUIRE | IONIC_CMB_EXPDB | \
+	 IONIC_CMB_WC | IONIC_CMB_UC)
+
+/* resource is not reserved on the device, indicated in tbl_order */
+#define IONIC_RES_INVALID -1
+
+struct ionic_aq;
+struct ionic_cq;
+struct ionic_eq;
+struct ionic_vcq;
+
+enum ionic_admin_state {
+	IONIC_ADMIN_ACTIVE, /* submitting admin commands to queue */
+	IONIC_ADMIN_PAUSED, /* not submitting, but may complete normally */
+	IONIC_ADMIN_KILLED, /* not submitting, locally completed */
+};
+
+enum ionic_admin_flags {
+	IONIC_ADMIN_F_BUSYWAIT = BIT(0), /* Don't sleep */
+	IONIC_ADMIN_F_TEARDOWN = BIT(1), /* In destroy path */
+	IONIC_ADMIN_F_INTERRUPT = BIT(2), /* Interruptible w/timeout */
+};
+
+enum ionic_mmap_flag {
+	IONIC_MMAP_WC = BIT(0),
+};
+
+struct ionic_mmap_entry {
+	struct rdma_user_mmap_entry rdma_entry;
+	unsigned long size;
+	unsigned long pfn;
+	u8 mmap_flags;
+};
+
+struct ionic_ibdev {
+	struct ib_device ibdev;
+
+	struct ionic_lif_cfg lif_cfg;
+
+	struct xarray qp_tbl;
+	struct xarray cq_tbl;
+
+	struct ionic_resid_bits inuse_dbid;
+	struct ionic_resid_bits inuse_pdid;
+	struct ionic_resid_bits inuse_ahid;
+	struct ionic_resid_bits inuse_mrid;
+	struct ionic_resid_bits inuse_qpid;
+	struct ionic_resid_bits inuse_cqid;
+
+	u8 half_cqid_udma_shift;
+	u8 half_qpid_udma_shift;
+	u8 next_qpid_udma_idx;
+	u8 next_mrkey;
+
+	struct work_struct reset_work;
+	bool reset_posted;
+	u32 reset_cnt;
+
+	struct delayed_work admin_dwork;
+	struct ionic_aq **aq_vec;
+	atomic_t admin_state;
+
+	struct ionic_eq **eq_vec;
+
+	struct ionic_v1_stat *hw_stats;
+	void *hw_stats_buf;
+	struct rdma_stat_desc *hw_stats_hdrs;
+	struct ionic_counter_stats *counter_stats;
+	int hw_stats_count;
+};
+
+struct ionic_eq {
+	struct ionic_ibdev *dev;
+
+	u32 eqid;
+	u32 intr;
+
+	struct ionic_queue q;
+
+	int armed;
+	bool enable;
+
+	struct work_struct work;
+
+	int irq;
+	char name[32];
+};
+
+struct ionic_admin_wr {
+	struct completion work;
+	struct list_head aq_ent;
+	struct ionic_v1_admin_wqe wqe;
+	struct ionic_v1_cqe cqe;
+	struct ionic_aq *aq;
+	int status;
+};
+
+struct ionic_admin_wr_q {
+	struct ionic_admin_wr *wr;
+	int wqe_strides;
+};
+
+struct ionic_aq {
+	struct ionic_ibdev *dev;
+	struct ionic_vcq *vcq;
+
+	struct work_struct work;
+
+	atomic_t admin_state;
+	unsigned long stamp;
+	bool armed;
+
+	u32 aqid;
+	u32 cqid;
+
+	spinlock_t lock; /* for posting */
+	struct ionic_queue q;
+	struct ionic_admin_wr_q *q_wr;
+	struct list_head wr_prod;
+ struct list_head wr_post; +}; + +struct ionic_ctx { + struct ib_ucontext ibctx; + u32 dbid; + struct rdma_user_mmap_entry *mmap_dbell; +}; + +struct ionic_tbl_buf { + u32 tbl_limit; + u32 tbl_pages; + size_t tbl_size; + __le64 *tbl_buf; + dma_addr_t tbl_dma; + u8 page_size_log2; +}; + +struct ionic_pd { + struct ib_pd ibpd; + + u32 pdid; + u32 flags; +}; + +struct ionic_cq { + struct ionic_vcq *vcq; + + u32 cqid; + u32 eqid; + + spinlock_t lock; /* for polling */ + struct list_head poll_sq; + bool flush; + struct list_head flush_sq; + struct list_head flush_rq; + struct list_head ibkill_flush_ent; + + struct ionic_queue q; + bool color; + int credit; + u16 arm_any_prod; + u16 arm_sol_prod; + + struct kref cq_kref; + struct completion cq_rel_comp; + + /* infrequently accessed, keep at end */ + struct ib_umem *umem; +}; + +struct ionic_vcq { + struct ib_cq ibcq; + struct ionic_cq cq[2]; + u8 udma_mask; + u8 poll_idx; +}; + +struct ionic_sq_meta { + u64 wrid; + u32 len; + u16 seq; + u8 ibop; + u8 ibsts; + u8 remote:1; + u8 signal:1; + u8 local_comp:1; +}; + +struct ionic_rq_meta { + struct ionic_rq_meta *next; + u64 wrid; +}; + +struct ionic_qp { + struct ib_qp ibqp; + enum ib_qp_state state; + + u32 qpid; + u32 ahid; + u32 sq_cqid; + u32 rq_cqid; + u8 udma_idx; + u8 has_ah:1; + u8 has_sq:1; + u8 has_rq:1; + u8 sig_all:1; + + struct list_head qp_list_counter; + + struct list_head cq_poll_sq; + struct list_head cq_flush_sq; + struct list_head cq_flush_rq; + struct list_head ibkill_flush_ent; + + spinlock_t sq_lock; /* for posting and polling */ + struct ionic_queue sq; + struct ionic_sq_meta *sq_meta; + u16 *sq_msn_idx; + int sq_spec; + u16 sq_old_prod; + u16 sq_msn_prod; + u16 sq_msn_cons; + u8 sq_cmb; + bool sq_flush; + bool sq_flush_rcvd; + + spinlock_t rq_lock; /* for posting and polling */ + struct ionic_queue rq; + struct ionic_rq_meta *rq_meta; + struct ionic_rq_meta *rq_meta_head; + int rq_spec; + u16 rq_old_prod; + u8 rq_cmb; + bool rq_flush; + + struct kref qp_kref; + struct completion qp_rel_comp; + + /* infrequently accessed, keep at end */ + int sgid_index; + int sq_cmb_order; + u32 sq_cmb_pgid; + phys_addr_t sq_cmb_addr; + struct rdma_user_mmap_entry *mmap_sq_cmb; + + struct ib_umem *sq_umem; + + int rq_cmb_order; + u32 rq_cmb_pgid; + phys_addr_t rq_cmb_addr; + struct rdma_user_mmap_entry *mmap_rq_cmb; + + struct ib_umem *rq_umem; + + int dcqcn_profile; + + struct ib_ud_header *hdr; +}; + +struct ionic_ah { + struct ib_ah ibah; + u32 ahid; + int sgid_index; + struct ib_ud_header hdr; +}; + +struct ionic_mr { + union { + struct ib_mr ibmr; + struct ib_mw ibmw; + }; + + u32 mrid; + int flags; + + struct ib_umem *umem; + struct ionic_tbl_buf buf; + bool created; +}; + +struct ionic_counter_stats { + int queue_stats_count; + struct ionic_v1_stat *hdr; + struct rdma_stat_desc *stats_hdrs; + struct xarray xa_counters; +}; + +struct ionic_counter { + void *vals; + struct list_head qp_list; +}; + +static inline struct ionic_ibdev *to_ionic_ibdev(struct ib_device *ibdev) +{ + return container_of(ibdev, struct ionic_ibdev, ibdev); +} + +static inline struct ionic_ctx *to_ionic_ctx(struct ib_ucontext *ibctx) +{ + return container_of(ibctx, struct ionic_ctx, ibctx); +} + +static inline struct ionic_ctx *to_ionic_ctx_uobj(struct ib_uobject *uobj) +{ + if (!uobj) + return NULL; + + if (!uobj->context) + return NULL; + + return to_ionic_ctx(uobj->context); +} + +static inline struct ionic_pd *to_ionic_pd(struct ib_pd *ibpd) +{ + return container_of(ibpd, struct ionic_pd, ibpd); +} + +static 
inline struct ionic_mr *to_ionic_mr(struct ib_mr *ibmr) +{ + return container_of(ibmr, struct ionic_mr, ibmr); +} + +static inline struct ionic_mr *to_ionic_mw(struct ib_mw *ibmw) +{ + return container_of(ibmw, struct ionic_mr, ibmw); +} + +static inline struct ionic_vcq *to_ionic_vcq(struct ib_cq *ibcq) +{ + return container_of(ibcq, struct ionic_vcq, ibcq); +} + +static inline struct ionic_cq *to_ionic_vcq_cq(struct ib_cq *ibcq, + uint8_t udma_idx) +{ + return &to_ionic_vcq(ibcq)->cq[udma_idx]; +} + +static inline struct ionic_qp *to_ionic_qp(struct ib_qp *ibqp) +{ + return container_of(ibqp, struct ionic_qp, ibqp); +} + +static inline struct ionic_ah *to_ionic_ah(struct ib_ah *ibah) +{ + return container_of(ibah, struct ionic_ah, ibah); +} + +static inline u32 ionic_ctx_dbid(struct ionic_ibdev *dev, + struct ionic_ctx *ctx) +{ + if (!ctx) + return dev->lif_cfg.dbid; + + return ctx->dbid; +} + +static inline u32 ionic_obj_dbid(struct ionic_ibdev *dev, + struct ib_uobject *uobj) +{ + return ionic_ctx_dbid(dev, to_ionic_ctx_uobj(uobj)); +} + +static inline bool ionic_ibop_is_local(enum ib_wr_opcode op) +{ + return op == IB_WR_LOCAL_INV || op == IB_WR_REG_MR; +} + +static inline void ionic_qp_complete(struct kref *kref) +{ + struct ionic_qp *qp = container_of(kref, struct ionic_qp, qp_kref); + + complete(&qp->qp_rel_comp); +} + +static inline void ionic_cq_complete(struct kref *kref) +{ + struct ionic_cq *cq = container_of(kref, struct ionic_cq, cq_kref); + + complete(&cq->cq_rel_comp); +} + +/* ionic_admin.c */ +extern struct workqueue_struct *ionic_evt_workq; +void ionic_admin_post(struct ionic_ibdev *dev, struct ionic_admin_wr *wr); +int ionic_admin_wait(struct ionic_ibdev *dev, struct ionic_admin_wr *wr, + enum ionic_admin_flags); + +int ionic_rdma_reset_devcmd(struct ionic_ibdev *dev); + +int ionic_create_rdma_admin(struct ionic_ibdev *dev); +void ionic_destroy_rdma_admin(struct ionic_ibdev *dev); +void ionic_kill_rdma_admin(struct ionic_ibdev *dev, bool fatal_path); + +/* ionic_controlpath.c */ +int ionic_create_cq_common(struct ionic_vcq *vcq, + struct ionic_tbl_buf *buf, + const struct ib_cq_init_attr *attr, + struct ionic_ctx *ctx, + struct ib_udata *udata, + struct ionic_qdesc *req_cq, + __u32 *resp_cqid, + int udma_idx); +void ionic_destroy_cq_common(struct ionic_ibdev *dev, struct ionic_cq *cq); +void ionic_flush_qp(struct ionic_ibdev *dev, struct ionic_qp *qp); +void ionic_notify_flush_cq(struct ionic_cq *cq); + +int ionic_alloc_ucontext(struct ib_ucontext *ibctx, struct ib_udata *udata); +void ionic_dealloc_ucontext(struct ib_ucontext *ibctx); +int ionic_mmap(struct ib_ucontext *ibctx, struct vm_area_struct *vma); +void ionic_mmap_free(struct rdma_user_mmap_entry *rdma_entry); +int ionic_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata); +int ionic_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata); +int ionic_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr, + struct ib_udata *udata); +int ionic_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr); +int ionic_destroy_ah(struct ib_ah *ibah, u32 flags); +struct ib_mr *ionic_get_dma_mr(struct ib_pd *ibpd, int access); +struct ib_mr *ionic_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 length, + u64 addr, int access, struct ib_dmah *dmah, + struct ib_udata *udata); +struct ib_mr *ionic_reg_user_mr_dmabuf(struct ib_pd *ibpd, u64 offset, + u64 length, u64 addr, int fd, int access, + struct ib_dmah *dmah, + struct uverbs_attr_bundle *attrs); +int ionic_dereg_mr(struct ib_mr *ibmr, struct ib_udata 
*udata); +struct ib_mr *ionic_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type type, + u32 max_sg); +int ionic_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset); +int ionic_alloc_mw(struct ib_mw *ibmw, struct ib_udata *udata); +int ionic_dealloc_mw(struct ib_mw *ibmw); +int ionic_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct uverbs_attr_bundle *attrs); +int ionic_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata); +int ionic_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *attr, + struct ib_udata *udata); +int ionic_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int mask, + struct ib_udata *udata); +int ionic_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int mask, + struct ib_qp_init_attr *init_attr); +int ionic_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata); + +/* ionic_datapath.c */ +int ionic_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad); +int ionic_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad); +int ionic_poll_cq(struct ib_cq *ibcq, int nwc, struct ib_wc *wc); +int ionic_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags); + +/* ionic_hw_stats.c */ +void ionic_stats_init(struct ionic_ibdev *dev); +void ionic_stats_cleanup(struct ionic_ibdev *dev); + +/* ionic_pgtbl.c */ +__le64 ionic_pgtbl_dma(struct ionic_tbl_buf *buf, u64 va); +__be64 ionic_pgtbl_off(struct ionic_tbl_buf *buf, u64 va); +int ionic_pgtbl_page(struct ionic_tbl_buf *buf, u64 dma); +int ionic_pgtbl_init(struct ionic_ibdev *dev, + struct ionic_tbl_buf *buf, + struct ib_umem *umem, + dma_addr_t dma, + int limit, + u64 page_size); +void ionic_pgtbl_unbuf(struct ionic_ibdev *dev, struct ionic_tbl_buf *buf); +#endif /* _IONIC_IBDEV_H_ */ diff --git a/drivers/infiniband/hw/ionic/ionic_lif_cfg.c b/drivers/infiniband/hw/ionic/ionic_lif_cfg.c new file mode 100644 index 000000000000..f3cd281c3a2f --- /dev/null +++ b/drivers/infiniband/hw/ionic/ionic_lif_cfg.c @@ -0,0 +1,111 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2025, Advanced Micro Devices, Inc. 
*/ + +#include <linux/kernel.h> + +#include <ionic.h> +#include <ionic_lif.h> + +#include "ionic_lif_cfg.h" + +#define IONIC_MIN_RDMA_VERSION 0 +#define IONIC_MAX_RDMA_VERSION 2 + +static u8 ionic_get_expdb(struct ionic_lif *lif) +{ + u8 expdb_support = 0; + + if (lif->ionic->idev.phy_cmb_expdb64_pages) + expdb_support |= IONIC_EXPDB_64B_WQE; + if (lif->ionic->idev.phy_cmb_expdb128_pages) + expdb_support |= IONIC_EXPDB_128B_WQE; + if (lif->ionic->idev.phy_cmb_expdb256_pages) + expdb_support |= IONIC_EXPDB_256B_WQE; + if (lif->ionic->idev.phy_cmb_expdb512_pages) + expdb_support |= IONIC_EXPDB_512B_WQE; + + return expdb_support; +} + +void ionic_fill_lif_cfg(struct ionic_lif *lif, struct ionic_lif_cfg *cfg) +{ + union ionic_lif_identity *ident = &lif->ionic->ident.lif; + + cfg->lif = lif; + cfg->hwdev = &lif->ionic->pdev->dev; + cfg->lif_index = lif->index; + cfg->lif_hw_index = lif->hw_index; + + cfg->dbid = lif->kern_pid; + cfg->dbid_count = le32_to_cpu(lif->ionic->ident.dev.ndbpgs_per_lif); + cfg->dbpage = lif->kern_dbpage; + cfg->intr_ctrl = lif->ionic->idev.intr_ctrl; + + cfg->db_phys = lif->ionic->bars[IONIC_PCI_BAR_DBELL].bus_addr; + + if (IONIC_VERSION(ident->rdma.version, ident->rdma.minor_version) >= + IONIC_VERSION(2, 1)) + cfg->page_size_supported = + le64_to_cpu(ident->rdma.page_size_cap); + else + cfg->page_size_supported = IONIC_PAGE_SIZE_SUPPORTED; + + cfg->rdma_version = ident->rdma.version; + cfg->qp_opcodes = ident->rdma.qp_opcodes; + cfg->admin_opcodes = ident->rdma.admin_opcodes; + + cfg->stats_type = le16_to_cpu(ident->rdma.stats_type); + cfg->npts_per_lif = le32_to_cpu(ident->rdma.npts_per_lif); + cfg->nmrs_per_lif = le32_to_cpu(ident->rdma.nmrs_per_lif); + cfg->nahs_per_lif = le32_to_cpu(ident->rdma.nahs_per_lif); + + cfg->aq_base = le32_to_cpu(ident->rdma.aq_qtype.qid_base); + cfg->cq_base = le32_to_cpu(ident->rdma.cq_qtype.qid_base); + cfg->eq_base = le32_to_cpu(ident->rdma.eq_qtype.qid_base); + + /* + * ionic_create_rdma_admin() may reduce aq_count or eq_count if + * it is unable to allocate all that were requested. 
+ * aq_count is tunable; see ionic_aq_count
+ * eq_count is tunable; see ionic_eq_count
+ */
+	cfg->aq_count = le32_to_cpu(ident->rdma.aq_qtype.qid_count);
+	cfg->eq_count = le32_to_cpu(ident->rdma.eq_qtype.qid_count);
+	cfg->cq_count = le32_to_cpu(ident->rdma.cq_qtype.qid_count);
+	cfg->qp_count = le32_to_cpu(ident->rdma.sq_qtype.qid_count);
+
+	cfg->aq_qtype = ident->rdma.aq_qtype.qtype;
+	cfg->sq_qtype = ident->rdma.sq_qtype.qtype;
+	cfg->rq_qtype = ident->rdma.rq_qtype.qtype;
+	cfg->cq_qtype = ident->rdma.cq_qtype.qtype;
+	cfg->eq_qtype = ident->rdma.eq_qtype.qtype;
+	cfg->udma_qgrp_shift = ident->rdma.udma_shift;
+	cfg->udma_count = 2;
+
+	cfg->max_stride = ident->rdma.max_stride;
+	cfg->expdb_mask = ionic_get_expdb(lif);
+
+	cfg->sq_expdb =
+		!!(lif->qtype_info[IONIC_QTYPE_TXQ].features & IONIC_QIDENT_F_EXPDB);
+	cfg->rq_expdb =
+		!!(lif->qtype_info[IONIC_QTYPE_RXQ].features & IONIC_QIDENT_F_EXPDB);
+}
+
+struct net_device *ionic_lif_netdev(struct ionic_lif *lif)
+{
+	struct net_device *netdev = lif->netdev;
+
+	dev_hold(netdev);
+	return netdev;
+}
+
+void ionic_lif_fw_version(struct ionic_lif *lif, char *str, size_t len)
+{
+	strscpy(str, lif->ionic->idev.dev_info.fw_version, len);
+}
+
+u8 ionic_lif_asic_rev(struct ionic_lif *lif)
+{
+	return lif->ionic->idev.dev_info.asic_rev;
+}
diff --git a/drivers/infiniband/hw/ionic/ionic_lif_cfg.h b/drivers/infiniband/hw/ionic/ionic_lif_cfg.h
new file mode 100644
index 000000000000..20853429f623
--- /dev/null
+++ b/drivers/infiniband/hw/ionic/ionic_lif_cfg.h
@@ -0,0 +1,66 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) 2018-2025, Advanced Micro Devices, Inc. */
+
+#ifndef _IONIC_LIF_CFG_H_
+#define _IONIC_LIF_CFG_H_
+
+#define IONIC_VERSION(a, b) (((a) << 16) + ((b) << 8))
+#define IONIC_PAGE_SIZE_SUPPORTED 0x40201000 /* 4KB, 2MB, 1GB */
+
+#define IONIC_EXPDB_64B_WQE BIT(0)
+#define IONIC_EXPDB_128B_WQE BIT(1)
+#define IONIC_EXPDB_256B_WQE BIT(2)
+#define IONIC_EXPDB_512B_WQE BIT(3)
+
+struct ionic_lif_cfg {
+	struct device *hwdev;
+	struct ionic_lif *lif;
+
+	int lif_index;
+	int lif_hw_index;
+
+	u32 dbid;
+	int dbid_count;
+	u64 __iomem *dbpage;
+	struct ionic_intr __iomem *intr_ctrl;
+	phys_addr_t db_phys;
+
+	u64 page_size_supported;
+	u32 npts_per_lif;
+	u32 nmrs_per_lif;
+	u32 nahs_per_lif;
+
+	u32 aq_base;
+	u32 cq_base;
+	u32 eq_base;
+
+	int aq_count;
+	int eq_count;
+	int cq_count;
+	int qp_count;
+
+	u16 stats_type;
+	u8 aq_qtype;
+	u8 sq_qtype;
+	u8 rq_qtype;
+	u8 cq_qtype;
+	u8 eq_qtype;
+
+	u8 udma_count;
+	u8 udma_qgrp_shift;
+
+	u8 rdma_version;
+	u8 qp_opcodes;
+	u8 admin_opcodes;
+
+	u8 max_stride;
+	bool sq_expdb;
+	bool rq_expdb;
+	u8 expdb_mask;
+};
+
+void ionic_fill_lif_cfg(struct ionic_lif *lif, struct ionic_lif_cfg *cfg);
+struct net_device *ionic_lif_netdev(struct ionic_lif *lif);
+void ionic_lif_fw_version(struct ionic_lif *lif, char *str, size_t len);
+u8 ionic_lif_asic_rev(struct ionic_lif *lif);
+
+#endif /* _IONIC_LIF_CFG_H_ */
diff --git a/drivers/infiniband/hw/ionic/ionic_pgtbl.c b/drivers/infiniband/hw/ionic/ionic_pgtbl.c
new file mode 100644
index 000000000000..e74db73c9246
--- /dev/null
+++ b/drivers/infiniband/hw/ionic/ionic_pgtbl.c
@@ -0,0 +1,143 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) 2018-2025, Advanced Micro Devices, Inc.
*/ + +#include <linux/mman.h> +#include <linux/dma-mapping.h> + +#include "ionic_fw.h" +#include "ionic_ibdev.h" + +__le64 ionic_pgtbl_dma(struct ionic_tbl_buf *buf, u64 va) +{ + u64 pg_mask = BIT_ULL(buf->page_size_log2) - 1; + u64 dma; + + if (!buf->tbl_pages) + return cpu_to_le64(0); + + if (buf->tbl_pages > 1) + return cpu_to_le64(buf->tbl_dma); + + if (buf->tbl_buf) + dma = le64_to_cpu(buf->tbl_buf[0]); + else + dma = buf->tbl_dma; + + return cpu_to_le64(dma + (va & pg_mask)); +} + +__be64 ionic_pgtbl_off(struct ionic_tbl_buf *buf, u64 va) +{ + if (buf->tbl_pages > 1) { + u64 pg_mask = BIT_ULL(buf->page_size_log2) - 1; + + return cpu_to_be64(va & pg_mask); + } + + return 0; +} + +int ionic_pgtbl_page(struct ionic_tbl_buf *buf, u64 dma) +{ + if (unlikely(buf->tbl_pages == buf->tbl_limit)) + return -ENOMEM; + + if (buf->tbl_buf) + buf->tbl_buf[buf->tbl_pages] = cpu_to_le64(dma); + else + buf->tbl_dma = dma; + + ++buf->tbl_pages; + + return 0; +} + +static int ionic_tbl_buf_alloc(struct ionic_ibdev *dev, + struct ionic_tbl_buf *buf) +{ + int rc; + + buf->tbl_size = buf->tbl_limit * sizeof(*buf->tbl_buf); + buf->tbl_buf = kmalloc(buf->tbl_size, GFP_KERNEL); + if (!buf->tbl_buf) + return -ENOMEM; + + buf->tbl_dma = dma_map_single(dev->lif_cfg.hwdev, buf->tbl_buf, + buf->tbl_size, DMA_TO_DEVICE); + rc = dma_mapping_error(dev->lif_cfg.hwdev, buf->tbl_dma); + if (rc) { + kfree(buf->tbl_buf); + return rc; + } + + return 0; +} + +static int ionic_pgtbl_umem(struct ionic_tbl_buf *buf, struct ib_umem *umem) +{ + struct ib_block_iter biter; + u64 page_dma; + int rc; + + rdma_umem_for_each_dma_block(umem, &biter, BIT_ULL(buf->page_size_log2)) { + page_dma = rdma_block_iter_dma_address(&biter); + rc = ionic_pgtbl_page(buf, page_dma); + if (rc) + return rc; + } + + return 0; +} + +void ionic_pgtbl_unbuf(struct ionic_ibdev *dev, struct ionic_tbl_buf *buf) +{ + if (buf->tbl_buf) + dma_unmap_single(dev->lif_cfg.hwdev, buf->tbl_dma, + buf->tbl_size, DMA_TO_DEVICE); + + kfree(buf->tbl_buf); + memset(buf, 0, sizeof(*buf)); +} + +int ionic_pgtbl_init(struct ionic_ibdev *dev, + struct ionic_tbl_buf *buf, + struct ib_umem *umem, + dma_addr_t dma, + int limit, + u64 page_size) +{ + int rc; + + memset(buf, 0, sizeof(*buf)); + + if (umem) { + limit = ib_umem_num_dma_blocks(umem, page_size); + buf->page_size_log2 = order_base_2(page_size); + } + + if (limit < 1) + return -EINVAL; + + buf->tbl_limit = limit; + + /* skip pgtbl if contiguous / direct translation */ + if (limit > 1) { + rc = ionic_tbl_buf_alloc(dev, buf); + if (rc) + return rc; + } + + if (umem) + rc = ionic_pgtbl_umem(buf, umem); + else + rc = ionic_pgtbl_page(buf, dma); + + if (rc) + goto err_unbuf; + + return 0; + +err_unbuf: + ionic_pgtbl_unbuf(dev, buf); + return rc; +} diff --git a/drivers/infiniband/hw/ionic/ionic_queue.c b/drivers/infiniband/hw/ionic/ionic_queue.c new file mode 100644 index 000000000000..aa897ed2a412 --- /dev/null +++ b/drivers/infiniband/hw/ionic/ionic_queue.c @@ -0,0 +1,52 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2025, Advanced Micro Devices, Inc. 
 */
+
+#include <linux/dma-mapping.h>
+
+#include "ionic_queue.h"
+
+int ionic_queue_init(struct ionic_queue *q, struct device *dma_dev,
+		     int depth, size_t stride)
+{
+	if (depth < 0 || depth > 0xffff)
+		return -EINVAL;
+
+	if (stride == 0 || stride > 0x10000)
+		return -EINVAL;
+
+	if (depth == 0)
+		depth = 1;
+
+	q->depth_log2 = order_base_2(depth + 1);
+	q->stride_log2 = order_base_2(stride);
+
+	if (q->depth_log2 + q->stride_log2 < PAGE_SHIFT)
+		q->depth_log2 = PAGE_SHIFT - q->stride_log2;
+
+	if (q->depth_log2 > 16 || q->stride_log2 > 16)
+		return -EINVAL;
+
+	q->size = BIT_ULL(q->depth_log2 + q->stride_log2);
+	q->mask = BIT(q->depth_log2) - 1;
+
+	q->ptr = dma_alloc_coherent(dma_dev, q->size, &q->dma, GFP_KERNEL);
+	if (!q->ptr)
+		return -ENOMEM;
+
+	/* it will always be page aligned, but just to be sure... */
+	if (!PAGE_ALIGNED(q->ptr)) {
+		dma_free_coherent(dma_dev, q->size, q->ptr, q->dma);
+		return -ENOMEM;
+	}
+
+	q->prod = 0;
+	q->cons = 0;
+	q->dbell = 0;
+
+	return 0;
+}
+
+void ionic_queue_destroy(struct ionic_queue *q, struct device *dma_dev)
+{
+	dma_free_coherent(dma_dev, q->size, q->ptr, q->dma);
+}
diff --git a/drivers/infiniband/hw/ionic/ionic_queue.h b/drivers/infiniband/hw/ionic/ionic_queue.h
new file mode 100644
index 000000000000..d18020d4cad5
--- /dev/null
+++ b/drivers/infiniband/hw/ionic/ionic_queue.h
@@ -0,0 +1,234 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) 2018-2025, Advanced Micro Devices, Inc. */
+
+#ifndef _IONIC_QUEUE_H_
+#define _IONIC_QUEUE_H_
+
+#include <linux/io.h>
+#include <ionic_regs.h>
+
+#define IONIC_MAX_DEPTH		0xffff
+#define IONIC_MAX_CQ_DEPTH	0xffff
+#define IONIC_CQ_RING_ARM	IONIC_DBELL_RING_1
+#define IONIC_CQ_RING_SOL	IONIC_DBELL_RING_2
+
+/**
+ * struct ionic_queue - Ring buffer used between device and driver
+ * @size:	Size of the buffer, in bytes
+ * @dma:	Dma address of the buffer
+ * @ptr:	Buffer virtual address
+ * @prod:	Driver position in the queue
+ * @cons:	Device position in the queue
+ * @mask:	Capacity of the queue, subtracting the hole
+ *		This value is equal to ((1 << depth_log2) - 1)
+ * @depth_log2:	Log base two size depth of the queue
+ * @stride_log2: Log base two size of an element in the queue
+ * @dbell:	Doorbell identifying bits
+ */
+struct ionic_queue {
+	size_t size;
+	dma_addr_t dma;
+	void *ptr;
+	u16 prod;
+	u16 cons;
+	u16 mask;
+	u8 depth_log2;
+	u8 stride_log2;
+	u64 dbell;
+};
+
+/**
+ * ionic_queue_init() - Initialize user space queue
+ * @q:		Uninitialized queue structure
+ * @dma_dev:	DMA device for mapping
+ * @depth:	Depth of the queue
+ * @stride:	Size of each element of the queue
+ *
+ * Return: status code
+ */
+int ionic_queue_init(struct ionic_queue *q, struct device *dma_dev,
+		     int depth, size_t stride);
+
+/**
+ * ionic_queue_destroy() - Destroy user space queue
+ * @q:		Queue structure
+ * @dma_dev:	DMA device for mapping
+ */
+void ionic_queue_destroy(struct ionic_queue *q, struct device *dma_dev);
+
+/**
+ * ionic_queue_empty() - Test if queue is empty
+ * @q:	Queue structure
+ *
+ * This is only valid for to-device queues.
+ *
+ * Return: is empty
+ */
+static inline bool ionic_queue_empty(struct ionic_queue *q)
+{
+	return q->prod == q->cons;
+}
+
+/**
+ * ionic_queue_length() - Get the current length of the queue
+ * @q:	Queue structure
+ *
+ * This is only valid for to-device queues.
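+ *
+ * For example, with depth_log2 = 4 (mask = 0xf), prod = 2 and cons = 12,
+ * (2 - 12) & 0xf = 6 entries are posted and not yet consumed.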
+ * + * Return: length + */ +static inline u16 ionic_queue_length(struct ionic_queue *q) +{ + return (q->prod - q->cons) & q->mask; +} + +/** + * ionic_queue_length_remaining() - Get the remaining length of the queue + * @q: Queue structure + * + * This is only valid for to-device queues. + * + * Return: length remaining + */ +static inline u16 ionic_queue_length_remaining(struct ionic_queue *q) +{ + return q->mask - ionic_queue_length(q); +} + +/** + * ionic_queue_full() - Test if queue is full + * @q: Queue structure + * + * This is only valid for to-device queues. + * + * Return: is full + */ +static inline bool ionic_queue_full(struct ionic_queue *q) +{ + return q->mask == ionic_queue_length(q); +} + +/** + * ionic_color_wrap() - Flip the color if prod is wrapped + * @prod: Queue index just after advancing + * @color: Queue color just prior to advancing the index + * + * Return: color after advancing the index + */ +static inline bool ionic_color_wrap(u16 prod, bool color) +{ + /* logical xor color with (prod == 0) */ + return color != (prod == 0); +} + +/** + * ionic_queue_at() - Get the element at the given index + * @q: Queue structure + * @idx: Index in the queue + * + * The index must be within the bounds of the queue. It is not checked here. + * + * Return: pointer to element at index + */ +static inline void *ionic_queue_at(struct ionic_queue *q, u16 idx) +{ + return q->ptr + ((unsigned long)idx << q->stride_log2); +} + +/** + * ionic_queue_at_prod() - Get the element at the producer index + * @q: Queue structure + * + * Return: pointer to element at producer index + */ +static inline void *ionic_queue_at_prod(struct ionic_queue *q) +{ + return ionic_queue_at(q, q->prod); +} + +/** + * ionic_queue_at_cons() - Get the element at the consumer index + * @q: Queue structure + * + * Return: pointer to element at consumer index + */ +static inline void *ionic_queue_at_cons(struct ionic_queue *q) +{ + return ionic_queue_at(q, q->cons); +} + +/** + * ionic_queue_next() - Compute the next index + * @q: Queue structure + * @idx: Index + * + * Return: next index after idx + */ +static inline u16 ionic_queue_next(struct ionic_queue *q, u16 idx) +{ + return (idx + 1) & q->mask; +} + +/** + * ionic_queue_produce() - Increase the producer index + * @q: Queue structure + * + * Caller must ensure that the queue is not full. It is not checked here. + */ +static inline void ionic_queue_produce(struct ionic_queue *q) +{ + q->prod = ionic_queue_next(q, q->prod); +} + +/** + * ionic_queue_consume() - Increase the consumer index + * @q: Queue structure + * + * Caller must ensure that the queue is not empty. It is not checked here. + * + * This is only valid for to-device queues. + */ +static inline void ionic_queue_consume(struct ionic_queue *q) +{ + q->cons = ionic_queue_next(q, q->cons); +} + +/** + * ionic_queue_consume_entries() - Increase the consumer index by entries + * @q: Queue structure + * @entries: Number of entries to increment + * + * Caller must ensure that the queue is not empty. It is not checked here. + * + * This is only valid for to-device queues. 
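+ *
+ * Unlike ionic_queue_consume(), this advances the consumer index by a whole
+ * batch of entries at once, e.g. after polling several completions.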
 */
+static inline void ionic_queue_consume_entries(struct ionic_queue *q,
+					       u16 entries)
+{
+	q->cons = (q->cons + entries) & q->mask;
+}
+
+/**
+ * ionic_queue_dbell_init() - Initialize doorbell bits for queue id
+ * @q:		Queue structure
+ * @qid:	Queue identifying number
+ */
+static inline void ionic_queue_dbell_init(struct ionic_queue *q, u32 qid)
+{
+	q->dbell = IONIC_DBELL_QID(qid);
+}
+
+/**
+ * ionic_queue_dbell_val() - Get current doorbell update value
+ * @q:	Queue structure
+ *
+ * Return: current doorbell update value
+ */
+static inline u64 ionic_queue_dbell_val(struct ionic_queue *q)
+{
+	return q->dbell | q->prod;
+}
+
+#endif /* _IONIC_QUEUE_H_ */
diff --git a/drivers/infiniband/hw/ionic/ionic_res.h b/drivers/infiniband/hw/ionic/ionic_res.h
new file mode 100644
index 000000000000..46c8c584bd9a
--- /dev/null
+++ b/drivers/infiniband/hw/ionic/ionic_res.h
@@ -0,0 +1,154 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) 2018-2025, Advanced Micro Devices, Inc. */
+
+#ifndef _IONIC_RES_H_
+#define _IONIC_RES_H_
+
+#include <linux/kernel.h>
+#include <linux/idr.h>
+
+/**
+ * struct ionic_resid_bits - Number allocator based on IDA
+ *
+ * @inuse:	IDA handle
+ * @inuse_size:	Highest ID limit for IDA
+ */
+struct ionic_resid_bits {
+	struct ida inuse;
+	unsigned int inuse_size;
+};
+
+/**
+ * ionic_resid_init() - Initialize a resid allocator
+ * @resid:	Uninitialized resid allocator
+ * @size:	Capacity of the allocator
+ */
+static inline void ionic_resid_init(struct ionic_resid_bits *resid,
+				    unsigned int size)
+{
+	resid->inuse_size = size;
+	ida_init(&resid->inuse);
+}
+
+/**
+ * ionic_resid_destroy() - Destroy a resid allocator
+ * @resid:	Resid allocator
+ */
+static inline void ionic_resid_destroy(struct ionic_resid_bits *resid)
+{
+	ida_destroy(&resid->inuse);
+}
+
+/**
+ * ionic_resid_get_shared() - Allocate an available shared resource id
+ * @resid:	Resid allocator
+ * @min:	Smallest valid resource id
+ * @size:	One after largest valid resource id
+ *
+ * Return: Resource id, or negative error number
+ */
+static inline int ionic_resid_get_shared(struct ionic_resid_bits *resid,
+					 unsigned int min,
+					 unsigned int size)
+{
+	return ida_alloc_range(&resid->inuse, min, size - 1, GFP_KERNEL);
+}
+
+/**
+ * ionic_resid_get() - Allocate an available resource id
+ * @resid:	Resid allocator
+ *
+ * Return: Resource id, or negative error number
+ */
+static inline int ionic_resid_get(struct ionic_resid_bits *resid)
+{
+	return ionic_resid_get_shared(resid, 0, resid->inuse_size);
+}
+
+/**
+ * ionic_resid_put() - Free a resource id
+ * @resid:	Resid allocator
+ * @id:		Resource id
+ */
+static inline void ionic_resid_put(struct ionic_resid_bits *resid, int id)
+{
+	ida_free(&resid->inuse, id);
+}
+
+/**
+ * ionic_bitid_to_qid() - Transform a resource bit index into a queue id
+ * @bitid:	Bit index
+ * @qgrp_shift:	Log2 number of queues per queue group
+ * @half_qid_shift: Log2 of half the total number of queues
+ *
+ * Return: Queue id
+ *
+ * Udma-constrained queues (QPs and CQs) are associated with their udma by
+ * queue group. Even queue groups are associated with udma0, and odd queue
+ * groups with udma1.
+ *
+ * For allocating queue ids, we want to arrange the bits into two halves,
+ * with the even queue groups of udma0 in the lower half of the bitset,
+ * and the odd queue groups of udma1 in the upper half of the bitset.
+ * Then, one or two calls of find_next_zero_bit can examine all the bits + * for queues of an entire udma. + * + * For example, assuming eight queue groups with qgrp qids per group: + * + * bitid 0*qgrp..1*qgrp-1 : qid 0*qgrp..1*qgrp-1 + * bitid 1*qgrp..2*qgrp-1 : qid 2*qgrp..3*qgrp-1 + * bitid 2*qgrp..3*qgrp-1 : qid 4*qgrp..5*qgrp-1 + * bitid 3*qgrp..4*qgrp-1 : qid 6*qgrp..7*qgrp-1 + * bitid 4*qgrp..5*qgrp-1 : qid 1*qgrp..2*qgrp-1 + * bitid 5*qgrp..6*qgrp-1 : qid 3*qgrp..4*qgrp-1 + * bitid 6*qgrp..7*qgrp-1 : qid 5*qgrp..6*qgrp-1 + * bitid 7*qgrp..8*qgrp-1 : qid 7*qgrp..8*qgrp-1 + * + * There are three important ranges of bits in the qid. There is the udma + * bit "U" at qgrp_shift, which is the least significant bit of the group + * index, and determines which udma a queue is associated with. + * The bits of lesser significance we can call the idx bits "I", which are + * the index of the queue within the group. The bits of greater significance + * we can call the grp bits "G", which are other bits of the group index that + * do not determine the udma. Those bits are just rearranged in the bit index + * in the bitset. A bitid has the udma bit in the most significant place, + * then the grp bits, then the idx bits. + * + * bitid: 00000000000000 U GGG IIIIII + * qid: 00000000000000 GGG U IIIIII + * + * Transforming from bit index to qid, or from qid to bit index, can be + * accomplished by rearranging the bits by masking and shifting. + */ +static inline u32 ionic_bitid_to_qid(u32 bitid, u8 qgrp_shift, + u8 half_qid_shift) +{ + u32 udma_bit = + (bitid & BIT(half_qid_shift)) >> (half_qid_shift - qgrp_shift); + u32 grp_bits = (bitid & GENMASK(half_qid_shift - 1, qgrp_shift)) << 1; + u32 idx_bits = bitid & (BIT(qgrp_shift) - 1); + + return grp_bits | udma_bit | idx_bits; +} + +/** + * ionic_qid_to_bitid() - Transform a queue id into a resource bit index + * @qid: queue index + * @qgrp_shift: Log2 number of queues per queue group + * @half_qid_shift: Log2 of half the total number of queues + * + * Return: Resource bit index + * + * This is the inverse of ionic_bitid_to_qid(). + */ +static inline u32 ionic_qid_to_bitid(u32 qid, u8 qgrp_shift, u8 half_qid_shift) +{ + u32 udma_bit = (qid & BIT(qgrp_shift)) << (half_qid_shift - qgrp_shift); + u32 grp_bits = (qid & GENMASK(half_qid_shift, qgrp_shift + 1)) >> 1; + u32 idx_bits = qid & (BIT(qgrp_shift) - 1); + + return udma_bit | grp_bits | idx_bits; +} +#endif /* _IONIC_RES_H_ */ diff --git a/drivers/infiniband/hw/irdma/Kconfig b/drivers/infiniband/hw/irdma/Kconfig index 5f49a58590ed..0bd7e3fca1fb 100644 --- a/drivers/infiniband/hw/irdma/Kconfig +++ b/drivers/infiniband/hw/irdma/Kconfig @@ -4,10 +4,11 @@ config INFINIBAND_IRDMA depends on INET depends on IPV6 || !IPV6 depends on PCI - depends on ICE && I40E + depends on IDPF && ICE && I40E select GENERIC_ALLOCATOR select AUXILIARY_BUS select CRC32 help - This is an Intel(R) Ethernet Protocol Driver for RDMA driver - that support E810 (iWARP/RoCE) and X722 (iWARP) network devices. + This is an Intel(R) Ethernet Protocol Driver for RDMA that + supports IPU E2000 (RoCEv2), E810 (iWARP/RoCEv2) and X722 (iWARP) + network devices. 
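A reviewer's aside, not part of the patch: the bitid<->qid mapping documented in
ionic_res.h above is easy to mis-read, so here is a minimal user-space C sketch
that mirrors the two helpers and checks that they are inverses. The values
qgrp_shift = 3 and half_qid_shift = 5 (eight groups of eight queues, so 64 qids
with half per udma) are illustrative assumptions, not values from the driver.

#include <stdio.h>
#include <stdint.h>

#define BIT(n)		(1u << (n))
#define GENMASK(h, l)	((~0u >> (31 - (h))) & (~0u << (l)))

/* Mirrors ionic_bitid_to_qid(): move the udma bit down, group bits up. */
static uint32_t bitid_to_qid(uint32_t bitid, uint8_t qgrp_shift,
			     uint8_t half_qid_shift)
{
	uint32_t udma_bit =
		(bitid & BIT(half_qid_shift)) >> (half_qid_shift - qgrp_shift);
	uint32_t grp_bits = (bitid & GENMASK(half_qid_shift - 1, qgrp_shift)) << 1;
	uint32_t idx_bits = bitid & (BIT(qgrp_shift) - 1);

	return grp_bits | udma_bit | idx_bits;
}

/* Mirrors ionic_qid_to_bitid(): the inverse bit rearrangement. */
static uint32_t qid_to_bitid(uint32_t qid, uint8_t qgrp_shift,
			     uint8_t half_qid_shift)
{
	uint32_t udma_bit = (qid & BIT(qgrp_shift)) << (half_qid_shift - qgrp_shift);
	uint32_t grp_bits = (qid & GENMASK(half_qid_shift, qgrp_shift + 1)) >> 1;
	uint32_t idx_bits = qid & (BIT(qgrp_shift) - 1);

	return udma_bit | grp_bits | idx_bits;
}

int main(void)
{
	/* Eight queue groups of eight queues: 64 qids, half of them per udma. */
	const uint8_t qgrp_shift = 3, half_qid_shift = 5;
	uint32_t bitid;

	for (bitid = 0; bitid < 64; bitid++) {
		uint32_t qid = bitid_to_qid(bitid, qgrp_shift, half_qid_shift);

		if (qid_to_bitid(qid, qgrp_shift, half_qid_shift) != bitid)
			printf("round-trip failed at bitid %u\n", (unsigned)bitid);
	}
	/* Matches the table above: bitids 8..15 map to qids 16..23, and the
	 * first odd (udma1) group, bitids 32..39, maps to qids 8..15.
	 */
	printf("bitid 8 -> qid %u, bitid 32 -> qid %u\n",
	       (unsigned)bitid_to_qid(8, qgrp_shift, half_qid_shift),
	       (unsigned)bitid_to_qid(32, qgrp_shift, half_qid_shift));
	return 0;
}

Keeping each udma's groups contiguous in the bitset is what lets the allocator
scan one whole udma with a single find_next_zero_bit() range, as the comment in
the header explains.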
diff --git a/drivers/infiniband/hw/irdma/Makefile b/drivers/infiniband/hw/irdma/Makefile index 48c3854235a0..03ceb9e5475f 100644 --- a/drivers/infiniband/hw/irdma/Makefile +++ b/drivers/infiniband/hw/irdma/Makefile @@ -13,7 +13,10 @@ irdma-objs := cm.o \ hw.o \ i40iw_hw.o \ i40iw_if.o \ + ig3rdma_if.o\ + icrdma_if.o \ icrdma_hw.o \ + ig3rdma_hw.o\ main.o \ pble.o \ puda.o \ @@ -22,6 +25,7 @@ irdma-objs := cm.o \ uk.o \ utils.o \ verbs.o \ + virtchnl.o \ ws.o \ CFLAGS_trace.o = -I$(src) diff --git a/drivers/infiniband/hw/irdma/ctrl.c b/drivers/infiniband/hw/irdma/ctrl.c index 99a7f1a6c0b5..4ef1c29032f7 100644 --- a/drivers/infiniband/hw/irdma/ctrl.c +++ b/drivers/infiniband/hw/irdma/ctrl.c @@ -74,6 +74,14 @@ static void irdma_set_qos_info(struct irdma_sc_vsi *vsi, { u8 i; + if (vsi->dev->hw_attrs.uk_attrs.hw_rev >= IRDMA_GEN_3) { + for (i = 0; i < IRDMA_MAX_USER_PRIORITY; i++) { + vsi->qos[i].qs_handle = vsi->dev->qos[i].qs_handle; + vsi->qos[i].valid = true; + } + + return; + } vsi->qos_rel_bw = l2p->vsi_rel_bw; vsi->qos_prio_type = l2p->vsi_prio_type; vsi->dscp_mode = l2p->dscp_mode; @@ -404,7 +412,8 @@ int irdma_sc_qp_init(struct irdma_sc_qp *qp, struct irdma_qp_init_info *info) pble_obj_cnt = info->pd->dev->hmc_info->hmc_obj[IRDMA_HMC_IW_PBLE].cnt; if ((info->virtual_map && info->sq_pa >= pble_obj_cnt) || - (info->virtual_map && info->rq_pa >= pble_obj_cnt)) + (!info->qp_uk_init_info.srq_uk && + info->virtual_map && info->rq_pa >= pble_obj_cnt)) return -EINVAL; qp->llp_stream_handle = (void *)(-1); @@ -439,6 +448,208 @@ int irdma_sc_qp_init(struct irdma_sc_qp *qp, struct irdma_qp_init_info *info) } /** + * irdma_sc_srq_init - init sc_srq structure + * @srq: srq sc struct + * @info: parameters for srq init + */ +int irdma_sc_srq_init(struct irdma_sc_srq *srq, + struct irdma_srq_init_info *info) +{ + u32 srq_size_quanta; + int ret_code; + + ret_code = irdma_uk_srq_init(&srq->srq_uk, &info->srq_uk_init_info); + if (ret_code) + return ret_code; + + srq->dev = info->pd->dev; + srq->pd = info->pd; + srq->vsi = info->vsi; + srq->srq_pa = info->srq_pa; + srq->first_pm_pbl_idx = info->first_pm_pbl_idx; + srq->pasid = info->pasid; + srq->pasid_valid = info->pasid_valid; + srq->srq_limit = info->srq_limit; + srq->leaf_pbl_size = info->leaf_pbl_size; + srq->virtual_map = info->virtual_map; + srq->tph_en = info->tph_en; + srq->arm_limit_event = info->arm_limit_event; + srq->tph_val = info->tph_value; + srq->shadow_area_pa = info->shadow_area_pa; + + /* Smallest SRQ size is 256B i.e. 
8 quanta */ + srq_size_quanta = max((u32)IRDMA_SRQ_MIN_QUANTA, + srq->srq_uk.srq_size * + srq->srq_uk.wqe_size_multiplier); + srq->hw_srq_size = irdma_get_encoded_wqe_size(srq_size_quanta, + IRDMA_QUEUE_TYPE_SRQ); + + return 0; +} + +/** + * irdma_sc_srq_create - send srq create CQP WQE + * @srq: srq sc struct + * @scratch: u64 saved to be used during cqp completion + * @post_sq: flag for cqp db to ring + */ +static int irdma_sc_srq_create(struct irdma_sc_srq *srq, u64 scratch, + bool post_sq) +{ + struct irdma_sc_cqp *cqp; + __le64 *wqe; + u64 hdr; + + cqp = srq->pd->dev->cqp; + if (srq->srq_uk.srq_id < cqp->dev->hw_attrs.min_hw_srq_id || + srq->srq_uk.srq_id > + (cqp->dev->hmc_info->hmc_obj[IRDMA_HMC_IW_SRQ].max_cnt - 1)) + return -EINVAL; + + wqe = irdma_sc_cqp_get_next_send_wqe(cqp, scratch); + if (!wqe) + return -ENOMEM; + + set_64bit_val(wqe, 0, + FIELD_PREP(IRDMA_CQPSQ_SRQ_SRQ_LIMIT, srq->srq_limit) | + FIELD_PREP(IRDMA_CQPSQ_SRQ_RQSIZE, srq->hw_srq_size) | + FIELD_PREP(IRDMA_CQPSQ_SRQ_RQ_WQE_SIZE, srq->srq_uk.wqe_size)); + set_64bit_val(wqe, 8, (uintptr_t)srq); + set_64bit_val(wqe, 16, + FIELD_PREP(IRDMA_CQPSQ_SRQ_PD_ID, srq->pd->pd_id)); + set_64bit_val(wqe, 32, + FIELD_PREP(IRDMA_CQPSQ_SRQ_PHYSICAL_BUFFER_ADDR, + srq->srq_pa >> + IRDMA_CQPSQ_SRQ_PHYSICAL_BUFFER_ADDR_S)); + set_64bit_val(wqe, 40, + FIELD_PREP(IRDMA_CQPSQ_SRQ_DB_SHADOW_ADDR, + srq->shadow_area_pa >> + IRDMA_CQPSQ_SRQ_DB_SHADOW_ADDR_S)); + set_64bit_val(wqe, 48, + FIELD_PREP(IRDMA_CQPSQ_SRQ_FIRST_PM_PBL_IDX, + srq->first_pm_pbl_idx)); + + hdr = srq->srq_uk.srq_id | + FIELD_PREP(IRDMA_CQPSQ_OPCODE, IRDMA_CQP_OP_CREATE_SRQ) | + FIELD_PREP(IRDMA_CQPSQ_SRQ_LEAF_PBL_SIZE, srq->leaf_pbl_size) | + FIELD_PREP(IRDMA_CQPSQ_SRQ_VIRTMAP, srq->virtual_map) | + FIELD_PREP(IRDMA_CQPSQ_SRQ_ARM_LIMIT_EVENT, + srq->arm_limit_event) | + FIELD_PREP(IRDMA_CQPSQ_WQEVALID, cqp->polarity); + + dma_wmb(); /* make sure WQE is written before valid bit is set */ + + set_64bit_val(wqe, 24, hdr); + + print_hex_dump_debug("WQE: SRQ_CREATE WQE", DUMP_PREFIX_OFFSET, 16, 8, + wqe, IRDMA_CQP_WQE_SIZE * 8, false); + if (post_sq) + irdma_sc_cqp_post_sq(cqp); + + return 0; +} + +/** + * irdma_sc_srq_modify - send modify_srq CQP WQE + * @srq: srq sc struct + * @info: parameters for srq modification + * @scratch: u64 saved to be used during cqp completion + * @post_sq: flag for cqp db to ring + */ +static int irdma_sc_srq_modify(struct irdma_sc_srq *srq, + struct irdma_modify_srq_info *info, u64 scratch, + bool post_sq) +{ + struct irdma_sc_cqp *cqp; + __le64 *wqe; + u64 hdr; + + cqp = srq->dev->cqp; + if (srq->srq_uk.srq_id < cqp->dev->hw_attrs.min_hw_srq_id || + srq->srq_uk.srq_id > + (cqp->dev->hmc_info->hmc_obj[IRDMA_HMC_IW_SRQ].max_cnt - 1)) + return -EINVAL; + + wqe = irdma_sc_cqp_get_next_send_wqe(cqp, scratch); + if (!wqe) + return -ENOMEM; + + set_64bit_val(wqe, 0, + FIELD_PREP(IRDMA_CQPSQ_SRQ_SRQ_LIMIT, info->srq_limit) | + FIELD_PREP(IRDMA_CQPSQ_SRQ_RQSIZE, srq->hw_srq_size) | + FIELD_PREP(IRDMA_CQPSQ_SRQ_RQ_WQE_SIZE, srq->srq_uk.wqe_size)); + set_64bit_val(wqe, 8, + FIELD_PREP(IRDMA_CQPSQ_SRQ_SRQCTX, srq->srq_uk.srq_id)); + set_64bit_val(wqe, 16, + FIELD_PREP(IRDMA_CQPSQ_SRQ_PD_ID, srq->pd->pd_id)); + set_64bit_val(wqe, 32, + FIELD_PREP(IRDMA_CQPSQ_SRQ_PHYSICAL_BUFFER_ADDR, + srq->srq_pa >> + IRDMA_CQPSQ_SRQ_PHYSICAL_BUFFER_ADDR_S)); + set_64bit_val(wqe, 40, + FIELD_PREP(IRDMA_CQPSQ_SRQ_DB_SHADOW_ADDR, + srq->shadow_area_pa >> + IRDMA_CQPSQ_SRQ_DB_SHADOW_ADDR_S)); + set_64bit_val(wqe, 48, + FIELD_PREP(IRDMA_CQPSQ_SRQ_FIRST_PM_PBL_IDX, + 
srq->first_pm_pbl_idx)); + + hdr = srq->srq_uk.srq_id | + FIELD_PREP(IRDMA_CQPSQ_OPCODE, IRDMA_CQP_OP_MODIFY_SRQ) | + FIELD_PREP(IRDMA_CQPSQ_SRQ_LEAF_PBL_SIZE, srq->leaf_pbl_size) | + FIELD_PREP(IRDMA_CQPSQ_SRQ_VIRTMAP, srq->virtual_map) | + FIELD_PREP(IRDMA_CQPSQ_SRQ_ARM_LIMIT_EVENT, + info->arm_limit_event) | + FIELD_PREP(IRDMA_CQPSQ_WQEVALID, cqp->polarity); + dma_wmb(); /* make sure WQE is written before valid bit is set */ + + set_64bit_val(wqe, 24, hdr); + + print_hex_dump_debug("WQE: SRQ_MODIFY WQE", DUMP_PREFIX_OFFSET, 16, 8, + wqe, IRDMA_CQP_WQE_SIZE * 8, false); + if (post_sq) + irdma_sc_cqp_post_sq(cqp); + + return 0; +} + +/** + * irdma_sc_srq_destroy - send srq_destroy CQP WQE + * @srq: srq sc struct + * @scratch: u64 saved to be used during cqp completion + * @post_sq: flag for cqp db to ring + */ +static int irdma_sc_srq_destroy(struct irdma_sc_srq *srq, u64 scratch, + bool post_sq) +{ + struct irdma_sc_cqp *cqp; + __le64 *wqe; + u64 hdr; + + cqp = srq->dev->cqp; + + wqe = irdma_sc_cqp_get_next_send_wqe(cqp, scratch); + if (!wqe) + return -ENOMEM; + + set_64bit_val(wqe, 8, (uintptr_t)srq); + + hdr = srq->srq_uk.srq_id | + FIELD_PREP(IRDMA_CQPSQ_OPCODE, IRDMA_CQP_OP_DESTROY_SRQ) | + FIELD_PREP(IRDMA_CQPSQ_WQEVALID, cqp->polarity); + dma_wmb(); /* make sure WQE is written before valid bit is set */ + + set_64bit_val(wqe, 24, hdr); + + print_hex_dump_debug("WQE: SRQ_DESTROY WQE", DUMP_PREFIX_OFFSET, 16, + 8, wqe, IRDMA_CQP_WQE_SIZE * 8, false); + if (post_sq) + irdma_sc_cqp_post_sq(cqp); + + return 0; +} + +/** * irdma_sc_qp_create - create qp * @qp: sc qp * @info: qp create info @@ -629,13 +840,14 @@ static u8 irdma_sc_get_encoded_ird_size(u16 ird_size) } /** - * irdma_sc_qp_setctx_roce - set qp's context + * irdma_sc_qp_setctx_roce_gen_2 - set qp's context * @qp: sc qp * @qp_ctx: context ptr * @info: ctx info */ -void irdma_sc_qp_setctx_roce(struct irdma_sc_qp *qp, __le64 *qp_ctx, - struct irdma_qp_host_ctx_info *info) +static void irdma_sc_qp_setctx_roce_gen_2(struct irdma_sc_qp *qp, + __le64 *qp_ctx, + struct irdma_qp_host_ctx_info *info) { struct irdma_roce_offload_info *roce_info; struct irdma_udp_offload_info *udp; @@ -753,6 +965,189 @@ void irdma_sc_qp_setctx_roce(struct irdma_sc_qp *qp, __le64 *qp_ctx, 8, qp_ctx, IRDMA_QP_CTX_SIZE, false); } +/** + * irdma_sc_get_encoded_ird_size_gen_3 - get encoded IRD size for GEN 3 + * @ird_size: IRD size + * The ird from the connection is rounded to a supported HW setting and then encoded + * for ird_size field of qp_ctx. Consumers are expected to provide valid ird size based + * on hardware attributes. IRD size defaults to a value of 4 in case of invalid input. + */ +static u8 irdma_sc_get_encoded_ird_size_gen_3(u16 ird_size) +{ + switch (ird_size ? 
+ roundup_pow_of_two(2 * ird_size) : 4) { + case 4096: + return IRDMA_IRD_HW_SIZE_4096_GEN3; + case 2048: + return IRDMA_IRD_HW_SIZE_2048_GEN3; + case 1024: + return IRDMA_IRD_HW_SIZE_1024_GEN3; + case 512: + return IRDMA_IRD_HW_SIZE_512_GEN3; + case 256: + return IRDMA_IRD_HW_SIZE_256_GEN3; + case 128: + return IRDMA_IRD_HW_SIZE_128_GEN3; + case 64: + return IRDMA_IRD_HW_SIZE_64_GEN3; + case 32: + return IRDMA_IRD_HW_SIZE_32_GEN3; + case 16: + return IRDMA_IRD_HW_SIZE_16_GEN3; + case 8: + return IRDMA_IRD_HW_SIZE_8_GEN3; + case 4: + default: + break; + } + + return IRDMA_IRD_HW_SIZE_4_GEN3; +} + +/** + * irdma_sc_qp_setctx_roce_gen_3 - set qp's context + * @qp: sc qp + * @qp_ctx: context ptr + * @info: ctx info + */ +static void irdma_sc_qp_setctx_roce_gen_3(struct irdma_sc_qp *qp, + __le64 *qp_ctx, + struct irdma_qp_host_ctx_info *info) +{ + struct irdma_roce_offload_info *roce_info = info->roce_info; + struct irdma_udp_offload_info *udp = info->udp_info; + u64 qw0, qw3, qw7 = 0, qw8 = 0; + u8 push_mode_en; + u32 push_idx; + + qp->user_pri = info->user_pri; + if (qp->push_idx == IRDMA_INVALID_PUSH_PAGE_INDEX) { + push_mode_en = 0; + push_idx = 0; + } else { + push_mode_en = 1; + push_idx = qp->push_idx; + } + + qw0 = FIELD_PREP(IRDMAQPC_RQWQESIZE, qp->qp_uk.rq_wqe_size) | + FIELD_PREP(IRDMAQPC_RCVTPHEN, qp->rcv_tph_en) | + FIELD_PREP(IRDMAQPC_XMITTPHEN, qp->xmit_tph_en) | + FIELD_PREP(IRDMAQPC_RQTPHEN, qp->rq_tph_en) | + FIELD_PREP(IRDMAQPC_SQTPHEN, qp->sq_tph_en) | + FIELD_PREP(IRDMAQPC_PPIDX, push_idx) | + FIELD_PREP(IRDMAQPC_PMENA, push_mode_en) | + FIELD_PREP(IRDMAQPC_DC_TCP_EN, roce_info->dctcp_en) | + FIELD_PREP(IRDMAQPC_ISQP1, roce_info->is_qp1) | + FIELD_PREP(IRDMAQPC_ROCE_TVER, roce_info->roce_tver) | + FIELD_PREP(IRDMAQPC_IPV4, udp->ipv4) | + FIELD_PREP(IRDMAQPC_USE_SRQ, !qp->qp_uk.srq_uk ? 
0 : 1) | + FIELD_PREP(IRDMAQPC_INSERTVLANTAG, udp->insert_vlan_tag); + set_64bit_val(qp_ctx, 0, qw0); + set_64bit_val(qp_ctx, 8, qp->sq_pa); + set_64bit_val(qp_ctx, 16, qp->rq_pa); + qw3 = FIELD_PREP(IRDMAQPC_RQSIZE, qp->hw_rq_size) | + FIELD_PREP(IRDMAQPC_SQSIZE, qp->hw_sq_size) | + FIELD_PREP(IRDMAQPC_TTL, udp->ttl) | + FIELD_PREP(IRDMAQPC_TOS, udp->tos) | + FIELD_PREP(IRDMAQPC_SRCPORTNUM, udp->src_port) | + FIELD_PREP(IRDMAQPC_DESTPORTNUM, udp->dst_port); + set_64bit_val(qp_ctx, 24, qw3); + set_64bit_val(qp_ctx, 32, + FIELD_PREP(IRDMAQPC_DESTIPADDR2, udp->dest_ip_addr[2]) | + FIELD_PREP(IRDMAQPC_DESTIPADDR3, udp->dest_ip_addr[3])); + set_64bit_val(qp_ctx, 40, + FIELD_PREP(IRDMAQPC_DESTIPADDR0, udp->dest_ip_addr[0]) | + FIELD_PREP(IRDMAQPC_DESTIPADDR1, udp->dest_ip_addr[1])); + set_64bit_val(qp_ctx, 48, + FIELD_PREP(IRDMAQPC_SNDMSS, udp->snd_mss) | + FIELD_PREP(IRDMAQPC_VLANTAG, udp->vlan_tag) | + FIELD_PREP(IRDMAQPC_ARPIDX, udp->arp_idx)); + qw7 = FIELD_PREP(IRDMAQPC_PKEY, roce_info->p_key) | + FIELD_PREP(IRDMAQPC_ACKCREDITS, roce_info->ack_credits) | + FIELD_PREP(IRDMAQPC_FLOWLABEL, udp->flow_label); + set_64bit_val(qp_ctx, 56, qw7); + qw8 = FIELD_PREP(IRDMAQPC_QKEY, roce_info->qkey) | + FIELD_PREP(IRDMAQPC_DESTQP, roce_info->dest_qp); + set_64bit_val(qp_ctx, 64, qw8); + set_64bit_val(qp_ctx, 80, + FIELD_PREP(IRDMAQPC_PSNNXT, udp->psn_nxt) | + FIELD_PREP(IRDMAQPC_LSN, udp->lsn)); + set_64bit_val(qp_ctx, 88, + FIELD_PREP(IRDMAQPC_EPSN, udp->epsn)); + set_64bit_val(qp_ctx, 96, + FIELD_PREP(IRDMAQPC_PSNMAX, udp->psn_max) | + FIELD_PREP(IRDMAQPC_PSNUNA, udp->psn_una)); + set_64bit_val(qp_ctx, 112, + FIELD_PREP(IRDMAQPC_CWNDROCE, udp->cwnd)); + set_64bit_val(qp_ctx, 128, + FIELD_PREP(IRDMAQPC_MINRNR_TIMER, udp->min_rnr_timer) | + FIELD_PREP(IRDMAQPC_RNRNAK_THRESH, udp->rnr_nak_thresh) | + FIELD_PREP(IRDMAQPC_REXMIT_THRESH, udp->rexmit_thresh) | + FIELD_PREP(IRDMAQPC_RNRNAK_TMR, udp->rnr_nak_tmr) | + FIELD_PREP(IRDMAQPC_RTOMIN, roce_info->rtomin)); + set_64bit_val(qp_ctx, 136, + FIELD_PREP(IRDMAQPC_TXCQNUM, info->send_cq_num) | + FIELD_PREP(IRDMAQPC_RXCQNUM, info->rcv_cq_num)); + set_64bit_val(qp_ctx, 152, + FIELD_PREP(IRDMAQPC_MACADDRESS, + ether_addr_to_u64(roce_info->mac_addr)) | + FIELD_PREP(IRDMAQPC_LOCALACKTIMEOUT, + roce_info->local_ack_timeout)); + set_64bit_val(qp_ctx, 160, + FIELD_PREP(IRDMAQPC_ORDSIZE_GEN3, roce_info->ord_size) | + FIELD_PREP(IRDMAQPC_IRDSIZE_GEN3, + irdma_sc_get_encoded_ird_size_gen_3(roce_info->ird_size)) | + FIELD_PREP(IRDMAQPC_WRRDRSPOK, roce_info->wr_rdresp_en) | + FIELD_PREP(IRDMAQPC_RDOK, roce_info->rd_en) | + FIELD_PREP(IRDMAQPC_USESTATSINSTANCE, + info->stats_idx_valid) | + FIELD_PREP(IRDMAQPC_BINDEN, roce_info->bind_en) | + FIELD_PREP(IRDMAQPC_FASTREGEN, roce_info->fast_reg_en) | + FIELD_PREP(IRDMAQPC_DCQCNENABLE, roce_info->dcqcn_en) | + FIELD_PREP(IRDMAQPC_RCVNOICRC, roce_info->rcv_no_icrc) | + FIELD_PREP(IRDMAQPC_FW_CC_ENABLE, + roce_info->fw_cc_enable) | + FIELD_PREP(IRDMAQPC_UDPRIVCQENABLE, + roce_info->udprivcq_en) | + FIELD_PREP(IRDMAQPC_PRIVEN, roce_info->priv_mode_en) | + FIELD_PREP(IRDMAQPC_REMOTE_ATOMIC_EN, + info->remote_atomics_en) | + FIELD_PREP(IRDMAQPC_TIMELYENABLE, roce_info->timely_en)); + set_64bit_val(qp_ctx, 168, + FIELD_PREP(IRDMAQPC_QPCOMPCTX, info->qp_compl_ctx)); + set_64bit_val(qp_ctx, 176, + FIELD_PREP(IRDMAQPC_SQTPHVAL, qp->sq_tph_val) | + FIELD_PREP(IRDMAQPC_RQTPHVAL, qp->rq_tph_val) | + FIELD_PREP(IRDMAQPC_QSHANDLE, qp->qs_handle)); + set_64bit_val(qp_ctx, 184, + FIELD_PREP(IRDMAQPC_LOCAL_IPADDR3, udp->local_ipaddr[3]) | + 
FIELD_PREP(IRDMAQPC_LOCAL_IPADDR2, udp->local_ipaddr[2])); + set_64bit_val(qp_ctx, 192, + FIELD_PREP(IRDMAQPC_LOCAL_IPADDR1, udp->local_ipaddr[1]) | + FIELD_PREP(IRDMAQPC_LOCAL_IPADDR0, udp->local_ipaddr[0])); + set_64bit_val(qp_ctx, 200, + FIELD_PREP(IRDMAQPC_THIGH, roce_info->t_high) | + FIELD_PREP(IRDMAQPC_SRQ_ID, + !qp->qp_uk.srq_uk ? + 0 : qp->qp_uk.srq_uk->srq_id) | + FIELD_PREP(IRDMAQPC_TLOW, roce_info->t_low)); + set_64bit_val(qp_ctx, 208, roce_info->pd_id | + FIELD_PREP(IRDMAQPC_STAT_INDEX_GEN3, info->stats_idx) | + FIELD_PREP(IRDMAQPC_PKT_LIMIT, qp->pkt_limit)); + + print_hex_dump_debug("WQE: QP_HOST ROCE CTX WQE", DUMP_PREFIX_OFFSET, + 16, 8, qp_ctx, IRDMA_QP_CTX_SIZE, false); +} + +void irdma_sc_qp_setctx_roce(struct irdma_sc_qp *qp, __le64 *qp_ctx, + struct irdma_qp_host_ctx_info *info) +{ + if (qp->dev->hw_attrs.uk_attrs.hw_rev == IRDMA_GEN_2) + irdma_sc_qp_setctx_roce_gen_2(qp, qp_ctx, info); + else + irdma_sc_qp_setctx_roce_gen_3(qp, qp_ctx, info); +} + /* irdma_sc_alloc_local_mac_entry - allocate a mac entry * @cqp: struct for cqp hw * @scratch: u64 saved to be used during cqp completion @@ -1080,7 +1475,8 @@ static int irdma_sc_alloc_stag(struct irdma_sc_dev *dev, FLD_LS_64(dev, info->pd_id, IRDMA_CQPSQ_STAG_PDID) | FIELD_PREP(IRDMA_CQPSQ_STAG_STAGLEN, info->total_len)); set_64bit_val(wqe, 16, - FIELD_PREP(IRDMA_CQPSQ_STAG_IDX, info->stag_idx)); + FIELD_PREP(IRDMA_CQPSQ_STAG_IDX, info->stag_idx) | + FIELD_PREP(IRDMA_CQPSQ_STAG_PDID_HI, info->pd_id >> 18)); set_64bit_val(wqe, 40, FIELD_PREP(IRDMA_CQPSQ_STAG_HMCFNIDX, info->hmc_fcn_index)); @@ -1096,6 +1492,8 @@ static int irdma_sc_alloc_stag(struct irdma_sc_dev *dev, FIELD_PREP(IRDMA_CQPSQ_STAG_REMACCENABLED, info->remote_access) | FIELD_PREP(IRDMA_CQPSQ_STAG_USEHMCFNIDX, info->use_hmc_fcn_index) | FIELD_PREP(IRDMA_CQPSQ_STAG_USEPFRID, info->use_pf_rid) | + FIELD_PREP(IRDMA_CQPSQ_STAG_REMOTE_ATOMIC_EN, + info->remote_atomics_en) | FIELD_PREP(IRDMA_CQPSQ_WQEVALID, cqp->polarity); dma_wmb(); /* make sure WQE is written before valid bit is set */ @@ -1165,6 +1563,7 @@ static int irdma_sc_mr_reg_non_shared(struct irdma_sc_dev *dev, FLD_LS_64(dev, info->pd_id, IRDMA_CQPSQ_STAG_PDID)); set_64bit_val(wqe, 16, FIELD_PREP(IRDMA_CQPSQ_STAG_KEY, info->stag_key) | + FIELD_PREP(IRDMA_CQPSQ_STAG_PDID_HI, info->pd_id >> 18) | FIELD_PREP(IRDMA_CQPSQ_STAG_IDX, info->stag_idx)); if (!info->chunk_size) { set_64bit_val(wqe, 32, info->reg_addr_pa); @@ -1187,6 +1586,8 @@ static int irdma_sc_mr_reg_non_shared(struct irdma_sc_dev *dev, FIELD_PREP(IRDMA_CQPSQ_STAG_VABASEDTO, addr_type) | FIELD_PREP(IRDMA_CQPSQ_STAG_USEHMCFNIDX, info->use_hmc_fcn_index) | FIELD_PREP(IRDMA_CQPSQ_STAG_USEPFRID, info->use_pf_rid) | + FIELD_PREP(IRDMA_CQPSQ_STAG_REMOTE_ATOMIC_EN, + info->remote_atomics_en) | FIELD_PREP(IRDMA_CQPSQ_WQEVALID, cqp->polarity); dma_wmb(); /* make sure WQE is written before valid bit is set */ @@ -1223,7 +1624,8 @@ static int irdma_sc_dealloc_stag(struct irdma_sc_dev *dev, set_64bit_val(wqe, 8, FLD_LS_64(dev, info->pd_id, IRDMA_CQPSQ_STAG_PDID)); set_64bit_val(wqe, 16, - FIELD_PREP(IRDMA_CQPSQ_STAG_IDX, info->stag_idx)); + FIELD_PREP(IRDMA_CQPSQ_STAG_IDX, info->stag_idx) | + FIELD_PREP(IRDMA_CQPSQ_STAG_PDID_HI, info->pd_id >> 18)); hdr = FIELD_PREP(IRDMA_CQPSQ_OPCODE, IRDMA_CQP_OP_DEALLOC_STAG) | FIELD_PREP(IRDMA_CQPSQ_STAG_MR, info->mr) | @@ -1263,7 +1665,8 @@ static int irdma_sc_mw_alloc(struct irdma_sc_dev *dev, set_64bit_val(wqe, 8, FLD_LS_64(dev, info->pd_id, IRDMA_CQPSQ_STAG_PDID)); set_64bit_val(wqe, 16, - 
FIELD_PREP(IRDMA_CQPSQ_STAG_IDX, info->mw_stag_index)); + FIELD_PREP(IRDMA_CQPSQ_STAG_IDX, info->mw_stag_index) | + FIELD_PREP(IRDMA_CQPSQ_STAG_PDID_HI, info->pd_id >> 18)); hdr = FIELD_PREP(IRDMA_CQPSQ_OPCODE, IRDMA_CQP_OP_ALLOC_STAG) | FIELD_PREP(IRDMA_CQPSQ_STAG_MWTYPE, info->mw_wide) | @@ -1343,6 +1746,7 @@ int irdma_sc_mr_fast_register(struct irdma_sc_qp *qp, FIELD_PREP(IRDMAQPSQ_READFENCE, info->read_fence) | FIELD_PREP(IRDMAQPSQ_LOCALFENCE, info->local_fence) | FIELD_PREP(IRDMAQPSQ_SIGCOMPL, info->signaled) | + FIELD_PREP(IRDMAQPSQ_REMOTE_ATOMICS_EN, info->remote_atomics_en) | FIELD_PREP(IRDMAQPSQ_VALID, qp->qp_uk.swqe_polarity); dma_wmb(); /* make sure WQE is written before valid bit is set */ @@ -1873,7 +2277,7 @@ void irdma_sc_vsi_init(struct irdma_sc_vsi *vsi, mutex_init(&vsi->qos[i].qos_mutex); INIT_LIST_HEAD(&vsi->qos[i].qplist); } - if (vsi->register_qset) { + if (vsi->dev->hw_attrs.uk_attrs.hw_rev == IRDMA_GEN_2) { vsi->dev->ws_add = irdma_ws_add; vsi->dev->ws_remove = irdma_ws_remove; vsi->dev->ws_reset = irdma_ws_reset; @@ -1888,7 +2292,7 @@ void irdma_sc_vsi_init(struct irdma_sc_vsi *vsi, * irdma_get_stats_idx - Return stats index * @vsi: pointer to the vsi */ -static u8 irdma_get_stats_idx(struct irdma_sc_vsi *vsi) +static u16 irdma_get_stats_idx(struct irdma_sc_vsi *vsi) { struct irdma_stats_inst_info stats_info = {}; struct irdma_sc_dev *dev = vsi->dev; @@ -1964,12 +2368,13 @@ int irdma_vsi_stats_init(struct irdma_sc_vsi *vsi, (void *)((uintptr_t)stats_buff_mem->va + IRDMA_GATHER_STATS_BUF_SIZE); - irdma_hw_stats_start_timer(vsi); + if (vsi->dev->hw_attrs.uk_attrs.hw_rev < IRDMA_GEN_3) + irdma_hw_stats_start_timer(vsi); /* when stat allocation is not required default to fcn_id. */ vsi->stats_idx = info->fcn_id; if (info->alloc_stats_inst) { - u8 stats_idx = irdma_get_stats_idx(vsi); + u16 stats_idx = irdma_get_stats_idx(vsi); if (stats_idx != IRDMA_INVALID_STATS_IDX) { vsi->stats_inst_alloc = true; @@ -1993,7 +2398,7 @@ void irdma_vsi_stats_free(struct irdma_sc_vsi *vsi) { struct irdma_stats_inst_info stats_info = {}; struct irdma_sc_dev *dev = vsi->dev; - u8 stats_idx = vsi->stats_idx; + u16 stats_idx = vsi->stats_idx; if (dev->hw_attrs.uk_attrs.hw_rev >= IRDMA_GEN_2) { if (vsi->stats_inst_alloc) { @@ -2009,7 +2414,9 @@ void irdma_vsi_stats_free(struct irdma_sc_vsi *vsi) if (!vsi->pestat) return; - irdma_hw_stats_stop_timer(vsi); + + if (dev->hw_attrs.uk_attrs.hw_rev < IRDMA_GEN_3) + irdma_hw_stats_stop_timer(vsi); dma_free_coherent(vsi->pestat->hw->device, vsi->pestat->gather_info.stats_buff_mem.size, vsi->pestat->gather_info.stats_buff_mem.va, @@ -2026,6 +2433,14 @@ u8 irdma_get_encoded_wqe_size(u32 wqsize, enum irdma_queue_type queue_type) { u8 encoded_size = 0; + if (queue_type == IRDMA_QUEUE_TYPE_SRQ) { + /* Smallest SRQ size is 256B (8 quanta) that gets + * encoded to 0. + */ + encoded_size = ilog2(wqsize) - 3; + + return encoded_size; + } /* cqp sq's hw coded value starts from 1 for size of 4 * while it starts from 0 for qp' wq's. 
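 * (SRQs are handled above: encoded_size = ilog2(quanta) - 3, e.g. 32 quanta -> 2.)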
*/ @@ -2259,6 +2674,12 @@ int irdma_sc_qp_flush_wqes(struct irdma_sc_qp *qp, info->ae_code | FIELD_PREP(IRDMA_CQPSQ_FWQE_AESOURCE, info->ae_src) : 0; set_64bit_val(wqe, 8, temp); + if (cqp->dev->hw_attrs.uk_attrs.hw_rev >= IRDMA_GEN_3) { + set_64bit_val(wqe, 40, + FIELD_PREP(IRDMA_CQPSQ_FWQE_ERR_SQ_IDX, info->err_sq_idx)); + set_64bit_val(wqe, 48, + FIELD_PREP(IRDMA_CQPSQ_FWQE_ERR_RQ_IDX, info->err_rq_idx)); + } hdr = qp->qp_uk.qp_id | FIELD_PREP(IRDMA_CQPSQ_OPCODE, IRDMA_CQP_OP_FLUSH_WQES) | @@ -2267,6 +2688,9 @@ int irdma_sc_qp_flush_wqes(struct irdma_sc_qp *qp, FIELD_PREP(IRDMA_CQPSQ_FWQE_FLUSHSQ, flush_sq) | FIELD_PREP(IRDMA_CQPSQ_FWQE_FLUSHRQ, flush_rq) | FIELD_PREP(IRDMA_CQPSQ_WQEVALID, cqp->polarity); + if (cqp->dev->hw_attrs.uk_attrs.hw_rev >= IRDMA_GEN_3) + hdr |= FIELD_PREP(IRDMA_CQPSQ_FWQE_ERR_SQ_IDX_VALID, info->err_sq_idx_valid) | + FIELD_PREP(IRDMA_CQPSQ_FWQE_ERR_RQ_IDX_VALID, info->err_rq_idx_valid); dma_wmb(); /* make sure WQE is written before valid bit is set */ set_64bit_val(wqe, 24, hdr); @@ -2562,6 +2986,9 @@ static int irdma_sc_cq_create(struct irdma_sc_cq *cq, u64 scratch, FIELD_PREP(IRDMA_CQPSQ_CQ_LPBLSIZE, cq->pbl_chunk_size) | FIELD_PREP(IRDMA_CQPSQ_CQ_CHKOVERFLOW, check_overflow) | FIELD_PREP(IRDMA_CQPSQ_CQ_VIRTMAP, cq->virtual_map) | + FIELD_PREP(IRDMA_CQPSQ_CQ_CQID_HIGH, cq->cq_uk.cq_id >> 22) | + FIELD_PREP(IRDMA_CQPSQ_CQ_CEQID_HIGH, + (cq->ceq_id_valid ? cq->ceq_id : 0) >> 10) | FIELD_PREP(IRDMA_CQPSQ_CQ_ENCEQEMASK, cq->ceqe_mask) | FIELD_PREP(IRDMA_CQPSQ_CQ_CEQIDVALID, cq->ceq_id_valid) | FIELD_PREP(IRDMA_CQPSQ_TPHEN, cq->tph_en) | @@ -2706,6 +3133,41 @@ static int irdma_sc_cq_modify(struct irdma_sc_cq *cq, } /** + * irdma_sc_get_decoded_ird_size_gen_3 - get decoded IRD size for GEN 3 + * @ird_enc: IRD encoding + * IRD size defaults to a value of 4 in case of invalid input. + */ +static u16 irdma_sc_get_decoded_ird_size_gen_3(u8 ird_enc) +{ + switch (ird_enc) { + case IRDMA_IRD_HW_SIZE_4096_GEN3: + return 4096; + case IRDMA_IRD_HW_SIZE_2048_GEN3: + return 2048; + case IRDMA_IRD_HW_SIZE_1024_GEN3: + return 1024; + case IRDMA_IRD_HW_SIZE_512_GEN3: + return 512; + case IRDMA_IRD_HW_SIZE_256_GEN3: + return 256; + case IRDMA_IRD_HW_SIZE_128_GEN3: + return 128; + case IRDMA_IRD_HW_SIZE_64_GEN3: + return 64; + case IRDMA_IRD_HW_SIZE_32_GEN3: + return 32; + case IRDMA_IRD_HW_SIZE_16_GEN3: + return 16; + case IRDMA_IRD_HW_SIZE_8_GEN3: + return 8; + case IRDMA_IRD_HW_SIZE_4_GEN3: + return 4; + default: + return 4; + } +} + +/** * irdma_check_cqp_progress - check cqp processing progress * @timeout: timeout info struct * @dev: sc device struct @@ -2738,6 +3200,89 @@ static inline void irdma_get_cqp_reg_info(struct irdma_sc_cqp *cqp, u32 *val, } /** + * irdma_sc_cqp_def_cmpl_ae_handler - remove completed requests from pending list + * @dev: sc device struct + * @info: AE entry info + * @first: true if this is the first call to this handler for given AEQE + * @scratch: (out) scratch entry pointer + * @sw_def_info: (in/out) SW ticket value for this AE + * + * In case of AE_DEF_CMPL event, this function should be called in a loop + * until it returns NULL-ptr via scratch. + * For each call, it looks for a matching CQP request on pending list, + * removes it from the list and returns the pointer to the associated scratch + * entry. + * If this is the first call to this function for given AEQE, sw_def_info + * value is not used to find matching requests. Instead, it is populated + * with the value from the first matching cqp_request on the list. 
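+ *
+ * A sketch of the expected caller loop (names here are illustrative):
+ *	first = true;
+ *	do {
+ *		irdma_sc_cqp_def_cmpl_ae_handler(dev, info, first,
+ *						 &scratch, &sw_def_info);
+ *		first = false;
+ *		if (scratch)
+ *			... release the cqp_request behind scratch ...
+ *	} while (scratch);
+ *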
+ * For subsequent calls, ooo_op->sw_def_info needs to match the value passed
+ * by the caller.
+ *
+ * Return: scratch entry pointer for cqp_request to be released or NULL
+ * if no matching request is found.
+ */
+void irdma_sc_cqp_def_cmpl_ae_handler(struct irdma_sc_dev *dev,
+				      struct irdma_aeqe_info *info,
+				      bool first, u64 *scratch,
+				      u32 *sw_def_info)
+{
+	struct irdma_ooo_cqp_op *ooo_op;
+	unsigned long flags;
+
+	*scratch = 0;
+
+	spin_lock_irqsave(&dev->cqp->ooo_list_lock, flags);
+	list_for_each_entry(ooo_op, &dev->cqp->ooo_pnd, list_entry) {
+		if (ooo_op->deferred &&
+		    ((first && ooo_op->def_info == info->def_info) ||
+		     (!first && ooo_op->sw_def_info == *sw_def_info))) {
+			*sw_def_info = ooo_op->sw_def_info;
+			*scratch = ooo_op->scratch;
+
+			list_move(&ooo_op->list_entry, &dev->cqp->ooo_avail);
+			atomic64_inc(&dev->cqp->completed_ops);
+
+			break;
+		}
+	}
+	spin_unlock_irqrestore(&dev->cqp->ooo_list_lock, flags);
+
+	if (first && !*scratch)
+		ibdev_dbg(to_ibdev(dev),
+			  "AEQ: deferred completion with unknown ticket: def_info 0x%x\n",
+			  info->def_info);
+}
+
+/**
+ * irdma_sc_cqp_cleanup_handler - remove requests from pending list
+ * @dev: sc device struct
+ *
+ * This function should be called in a loop from irdma_cleanup_pending_cqp_op.
+ * For each call, it returns first CQP request on pending list, removes it
+ * from the list and returns the pointer to the associated scratch entry.
+ *
+ * Return: scratch entry pointer for cqp_request to be released or NULL
+ * if pending list is empty.
+ */
+u64 irdma_sc_cqp_cleanup_handler(struct irdma_sc_dev *dev)
+{
+	struct irdma_ooo_cqp_op *ooo_op;
+	u64 scratch = 0;
+
+	list_for_each_entry(ooo_op, &dev->cqp->ooo_pnd, list_entry) {
+		scratch = ooo_op->scratch;
+
+		list_del(&ooo_op->list_entry);
+		list_add(&ooo_op->list_entry, &dev->cqp->ooo_avail);
+		atomic64_inc(&dev->cqp->completed_ops);
+
+		break;
+	}
+
+	return scratch;
+}
+
+/**
 * irdma_cqp_poll_registers - poll cqp registers
 * @cqp: struct for cqp hw
 * @tail: wqtail register value
@@ -2794,7 +3339,10 @@ static u64 irdma_sc_decode_fpm_commit(struct irdma_sc_dev *dev, __le64 *buf,
 		obj_info[rsrc_idx].cnt = (u32)FLD_RS_64(dev, temp, IRDMA_COMMIT_FPM_CQCNT);
 		break;
 	case IRDMA_HMC_IW_APBVT_ENTRY:
-		obj_info[rsrc_idx].cnt = 1;
+		if (dev->hw_attrs.uk_attrs.hw_rev <= IRDMA_GEN_2)
+			obj_info[rsrc_idx].cnt = 1;
+		else
+			obj_info[rsrc_idx].cnt = 0;
 		break;
 	default:
 		obj_info[rsrc_idx].cnt = (u32)temp;
@@ -2829,7 +3377,8 @@ irdma_sc_parse_fpm_commit_buf(struct irdma_sc_dev *dev, __le64 *buf,
 					   IRDMA_HMC_IW_QP);
 	irdma_sc_decode_fpm_commit(dev, buf, 8, info,
 				   IRDMA_HMC_IW_CQ);
-	/* skiping RSRVD */
+	irdma_sc_decode_fpm_commit(dev, buf, 16, info,
+				   IRDMA_HMC_IW_SRQ);
 	irdma_sc_decode_fpm_commit(dev, buf, 24, info,
 				   IRDMA_HMC_IW_HTE);
 	irdma_sc_decode_fpm_commit(dev, buf, 32, info,
@@ -2864,15 +3413,17 @@ irdma_sc_parse_fpm_commit_buf(struct irdma_sc_dev *dev, __le64 *buf,
 					   IRDMA_HMC_IW_HDR);
 	irdma_sc_decode_fpm_commit(dev, buf, 152, info,
 				   IRDMA_HMC_IW_MD);
-	irdma_sc_decode_fpm_commit(dev, buf, 160, info,
-				   IRDMA_HMC_IW_OOISC);
-	irdma_sc_decode_fpm_commit(dev, buf, 168, info,
-				   IRDMA_HMC_IW_OOISCFFL);
+	if (dev->cqp->protocol_used == IRDMA_IWARP_PROTOCOL_ONLY) {
+		irdma_sc_decode_fpm_commit(dev, buf, 160, info,
+					   IRDMA_HMC_IW_OOISC);
+		irdma_sc_decode_fpm_commit(dev, buf, 168, info,
+					   IRDMA_HMC_IW_OOISCFFL);
+	}
 	}
 
 	/* searching for the last object in HMC to find the size of the HMC area.
 */
	for (i = IRDMA_HMC_IW_QP; i < IRDMA_HMC_IW_MAX; i++) {
-		if (info[i].base > max_base) {
+		if (info[i].base > max_base && info[i].cnt) {
 			max_base = info[i].base;
 			last_hmc_obj = i;
 		}
@@ -2927,6 +3478,7 @@ static int irdma_sc_parse_fpm_query_buf(struct irdma_sc_dev *dev, __le64 *buf,
 					struct irdma_hmc_fpm_misc *hmc_fpm_misc)
 {
 	struct irdma_hmc_obj_info *obj_info;
+	u8 ird_encoding;
 	u64 temp;
 	u32 size;
 	u16 max_pe_sds;
@@ -2935,7 +3487,19 @@ static int irdma_sc_parse_fpm_query_buf(struct irdma_sc_dev *dev, __le64 *buf,
 	get_64bit_val(buf, 0, &temp);
 	hmc_info->first_sd_index = (u16)FIELD_GET(IRDMA_QUERY_FPM_FIRST_PE_SD_INDEX, temp);
-	max_pe_sds = (u16)FIELD_GET(IRDMA_QUERY_FPM_MAX_PE_SDS, temp);
+
+	if (dev->hw_attrs.uk_attrs.hw_rev >= IRDMA_GEN_3)
+		max_pe_sds = (u16)FIELD_GET(IRDMA_QUERY_FPM_MAX_PE_SDS_GEN3, temp);
+	else
+		max_pe_sds = (u16)FIELD_GET(IRDMA_QUERY_FPM_MAX_PE_SDS, temp);
+
+	/* Reduce SD count for unprivileged functions by 1 to account for PBLE
+	 * backing page rounding
+	 */
+	if (dev->hw_attrs.uk_attrs.hw_rev <= IRDMA_GEN_2 &&
+	    (hmc_info->hmc_fn_id >= dev->hw_attrs.first_hw_vf_fpm_id ||
+	     !dev->privileged))
+		max_pe_sds--;
 	hmc_fpm_misc->max_sds = max_pe_sds;
 	hmc_info->sd_table.sd_cnt = max_pe_sds + hmc_info->first_sd_index;
@@ -2949,11 +3513,17 @@ static int irdma_sc_parse_fpm_query_buf(struct irdma_sc_dev *dev, __le64 *buf,
 	size = (u32)(temp >> 32);
 	obj_info[IRDMA_HMC_IW_CQ].size = BIT_ULL(size);
 
+	irdma_sc_decode_fpm_query(buf, 24, obj_info, IRDMA_HMC_IW_SRQ);
 	irdma_sc_decode_fpm_query(buf, 32, obj_info, IRDMA_HMC_IW_HTE);
 	irdma_sc_decode_fpm_query(buf, 40, obj_info, IRDMA_HMC_IW_ARP);
 
-	obj_info[IRDMA_HMC_IW_APBVT_ENTRY].size = 8192;
-	obj_info[IRDMA_HMC_IW_APBVT_ENTRY].max_cnt = 1;
+	if (dev->hw_attrs.uk_attrs.hw_rev >= IRDMA_GEN_3) {
+		obj_info[IRDMA_HMC_IW_APBVT_ENTRY].size = 0;
+		obj_info[IRDMA_HMC_IW_APBVT_ENTRY].max_cnt = 0;
+	} else {
+		obj_info[IRDMA_HMC_IW_APBVT_ENTRY].size = 8192;
+		obj_info[IRDMA_HMC_IW_APBVT_ENTRY].max_cnt = 1;
+	}
 
 	irdma_sc_decode_fpm_query(buf, 48, obj_info, IRDMA_HMC_IW_MR);
 	irdma_sc_decode_fpm_query(buf, 56, obj_info, IRDMA_HMC_IW_XF);
@@ -2962,7 +3532,7 @@ static int irdma_sc_parse_fpm_query_buf(struct irdma_sc_dev *dev, __le64 *buf,
 	obj_info[IRDMA_HMC_IW_XFFL].max_cnt = (u32)temp;
 	obj_info[IRDMA_HMC_IW_XFFL].size = 4;
 	hmc_fpm_misc->xf_block_size = FIELD_GET(IRDMA_QUERY_FPM_XFBLOCKSIZE, temp);
-	if (!hmc_fpm_misc->xf_block_size)
+	if (obj_info[IRDMA_HMC_IW_XF].max_cnt && !hmc_fpm_misc->xf_block_size)
 		return -EINVAL;
 
 	irdma_sc_decode_fpm_query(buf, 72, obj_info, IRDMA_HMC_IW_Q1);
@@ -2984,6 +3554,14 @@ static int irdma_sc_parse_fpm_query_buf(struct irdma_sc_dev *dev, __le64 *buf,
 	hmc_fpm_misc->max_ceqs = FIELD_GET(IRDMA_QUERY_FPM_MAX_CEQS, temp);
 	hmc_fpm_misc->ht_multiplier = FIELD_GET(IRDMA_QUERY_FPM_HTMULTIPLIER, temp);
 	hmc_fpm_misc->timer_bucket = FIELD_GET(IRDMA_QUERY_FPM_TIMERBUCKET, temp);
+	if (FIELD_GET(IRDMA_MANAGE_RSRC_VER2,
+		      dev->feature_info[IRDMA_FTN_FLAGS])) {
+		ird_encoding = (u8)FIELD_GET(IRDMA_QUERY_FPM_MAX_IRD, temp);
+		hmc_fpm_misc->ird =
+			irdma_sc_get_decoded_ird_size_gen_3(ird_encoding) / 2;
+		dev->hw_attrs.max_hw_ird = hmc_fpm_misc->ird;
+		dev->hw_attrs.max_hw_ord = hmc_fpm_misc->ird;
+	}
 	if (dev->hw_attrs.uk_attrs.hw_rev == IRDMA_GEN_1)
 		return 0;
 	irdma_sc_decode_fpm_query(buf, 96, obj_info, IRDMA_HMC_IW_FSIMC);
@@ -3000,15 +3578,25 @@ static int irdma_sc_parse_fpm_query_buf(struct irdma_sc_dev *dev, __le64 *buf,
 	irdma_sc_decode_fpm_query(buf, 144, obj_info, IRDMA_HMC_IW_HDR);
 	irdma_sc_decode_fpm_query(buf, 152,
obj_info, IRDMA_HMC_IW_MD); - irdma_sc_decode_fpm_query(buf, 160, obj_info, IRDMA_HMC_IW_OOISC); - - get_64bit_val(buf, 168, &temp); - obj_info[IRDMA_HMC_IW_OOISCFFL].max_cnt = (u32)temp; - obj_info[IRDMA_HMC_IW_OOISCFFL].size = 4; - hmc_fpm_misc->ooiscf_block_size = FIELD_GET(IRDMA_QUERY_FPM_OOISCFBLOCKSIZE, temp); - if (!hmc_fpm_misc->ooiscf_block_size && - obj_info[IRDMA_HMC_IW_OOISCFFL].max_cnt) - return -EINVAL; + + if (dev->cqp->protocol_used == IRDMA_IWARP_PROTOCOL_ONLY) { + irdma_sc_decode_fpm_query(buf, 160, obj_info, IRDMA_HMC_IW_OOISC); + + get_64bit_val(buf, 168, &temp); + obj_info[IRDMA_HMC_IW_OOISCFFL].max_cnt = (u32)temp; + obj_info[IRDMA_HMC_IW_OOISCFFL].size = 4; + hmc_fpm_misc->ooiscf_block_size = FIELD_GET(IRDMA_QUERY_FPM_OOISCFBLOCKSIZE, temp); + if (!hmc_fpm_misc->ooiscf_block_size && + obj_info[IRDMA_HMC_IW_OOISCFFL].max_cnt) + return -EINVAL; + } + + if (dev->hw_attrs.uk_attrs.hw_rev >= IRDMA_GEN_3) { + get_64bit_val(buf, 176, &temp); + hmc_fpm_misc->loc_mem_pages = (u32)FIELD_GET(IRDMA_QUERY_FPM_LOC_MEM_PAGES, temp); + if (!hmc_fpm_misc->loc_mem_pages) + return -EINVAL; + } return 0; } @@ -3088,6 +3676,8 @@ exit: int irdma_sc_cqp_init(struct irdma_sc_cqp *cqp, struct irdma_cqp_init_info *info) { + struct irdma_ooo_cqp_op *ooo_op; + u32 num_ooo_ops; u8 hw_sq_size; if (info->sq_size > IRDMA_CQP_SW_SQSIZE_2048 || @@ -3118,17 +3708,43 @@ int irdma_sc_cqp_init(struct irdma_sc_cqp *cqp, cqp->rocev2_rto_policy = info->rocev2_rto_policy; cqp->protocol_used = info->protocol_used; memcpy(&cqp->dcqcn_params, &info->dcqcn_params, sizeof(cqp->dcqcn_params)); + if (cqp->dev->hw_attrs.uk_attrs.hw_rev >= IRDMA_GEN_3) { + cqp->ooisc_blksize = info->ooisc_blksize; + cqp->rrsp_blksize = info->rrsp_blksize; + cqp->q1_blksize = info->q1_blksize; + cqp->xmit_blksize = info->xmit_blksize; + cqp->blksizes_valid = info->blksizes_valid; + cqp->ts_shift = info->ts_shift; + cqp->ts_override = info->ts_override; + cqp->en_fine_grained_timers = info->en_fine_grained_timers; + cqp->pe_en_vf_cnt = info->pe_en_vf_cnt; + cqp->ooo_op_array = info->ooo_op_array; + /* initialize the OOO lists */ + INIT_LIST_HEAD(&cqp->ooo_avail); + INIT_LIST_HEAD(&cqp->ooo_pnd); + if (cqp->ooo_op_array) { + /* Populate avail list entries */ + for (num_ooo_ops = 0, ooo_op = info->ooo_op_array; + num_ooo_ops < cqp->sq_size; + num_ooo_ops++, ooo_op++) + list_add(&ooo_op->list_entry, &cqp->ooo_avail); + } + } info->dev->cqp = cqp; IRDMA_RING_INIT(cqp->sq_ring, cqp->sq_size); + cqp->last_def_cmpl_ticket = 0; + cqp->sw_def_cmpl_ticket = 0; cqp->requested_ops = 0; atomic64_set(&cqp->completed_ops, 0); /* for the cqp commands backlog. 
*/ INIT_LIST_HEAD(&cqp->dev->cqp_cmd_head); writel(0, cqp->dev->hw_regs[IRDMA_CQPTAIL]); - writel(0, cqp->dev->hw_regs[IRDMA_CQPDB]); - writel(0, cqp->dev->hw_regs[IRDMA_CCQPSTATUS]); + if (cqp->dev->hw_attrs.uk_attrs.hw_rev <= IRDMA_GEN_2) { + writel(0, cqp->dev->hw_regs[IRDMA_CQPDB]); + writel(0, cqp->dev->hw_regs[IRDMA_CCQPSTATUS]); + } ibdev_dbg(to_ibdev(cqp->dev), "WQE: sq_size[%04d] hw_sq_size[%04d] sq_base[%p] sq_pa[%p] cqp[%p] polarity[x%04x]\n", @@ -3160,6 +3776,7 @@ int irdma_sc_cqp_create(struct irdma_sc_cqp *cqp, u16 *maj_err, u16 *min_err) return -ENOMEM; spin_lock_init(&cqp->dev->cqp_lock); + spin_lock_init(&cqp->ooo_list_lock); temp = FIELD_PREP(IRDMA_CQPHC_SQSIZE, cqp->hw_sq_size) | FIELD_PREP(IRDMA_CQPHC_SVER, cqp->struct_ver) | @@ -3171,12 +3788,29 @@ int irdma_sc_cqp_create(struct irdma_sc_cqp *cqp, u16 *maj_err, u16 *min_err) FIELD_PREP(IRDMA_CQPHC_PROTOCOL_USED, cqp->protocol_used); } + if (hw_rev >= IRDMA_GEN_3) + temp |= FIELD_PREP(IRDMA_CQPHC_EN_FINE_GRAINED_TIMERS, + cqp->en_fine_grained_timers); set_64bit_val(cqp->host_ctx, 0, temp); set_64bit_val(cqp->host_ctx, 8, cqp->sq_pa); temp = FIELD_PREP(IRDMA_CQPHC_ENABLED_VFS, cqp->ena_vf_count) | FIELD_PREP(IRDMA_CQPHC_HMC_PROFILE, cqp->hmc_profile); + + if (hw_rev >= IRDMA_GEN_3) + temp |= FIELD_PREP(IRDMA_CQPHC_OOISC_BLKSIZE, + cqp->ooisc_blksize) | + FIELD_PREP(IRDMA_CQPHC_RRSP_BLKSIZE, + cqp->rrsp_blksize) | + FIELD_PREP(IRDMA_CQPHC_Q1_BLKSIZE, cqp->q1_blksize) | + FIELD_PREP(IRDMA_CQPHC_XMIT_BLKSIZE, + cqp->xmit_blksize) | + FIELD_PREP(IRDMA_CQPHC_BLKSIZES_VALID, + cqp->blksizes_valid) | + FIELD_PREP(IRDMA_CQPHC_TIMESTAMP_OVERRIDE, + cqp->ts_override) | + FIELD_PREP(IRDMA_CQPHC_TS_SHIFT, cqp->ts_shift); set_64bit_val(cqp->host_ctx, 16, temp); set_64bit_val(cqp->host_ctx, 24, (uintptr_t)cqp); temp = FIELD_PREP(IRDMA_CQPHC_HW_MAJVER, cqp->hw_maj_ver) | @@ -3338,6 +3972,87 @@ void irdma_sc_ccq_arm(struct irdma_sc_cq *ccq) } /** + * irdma_sc_process_def_cmpl - process deferred or pending completion + * @cqp: CQP sc struct + * @info: CQP CQE info + * @wqe_idx: CQP WQE descriptor index + * @def_info: deferred op ticket value or out-of-order completion id + * @def_cmpl: true for deferred completion, false for pending (RCA) + */ +static void irdma_sc_process_def_cmpl(struct irdma_sc_cqp *cqp, + struct irdma_ccq_cqe_info *info, + u32 wqe_idx, u32 def_info, bool def_cmpl) +{ + struct irdma_ooo_cqp_op *ooo_op; + unsigned long flags; + + /* Deferred and out-of-order completions share the same list of pending + * completions. Since the list can be also accessed from AE handler, + * it must be protected by a lock. + */ + spin_lock_irqsave(&cqp->ooo_list_lock, flags); + + /* For deferred completions bump up SW completion ticket value. */ + if (def_cmpl) { + cqp->last_def_cmpl_ticket = def_info; + cqp->sw_def_cmpl_ticket++; + } + if (!list_empty(&cqp->ooo_avail)) { + ooo_op = (struct irdma_ooo_cqp_op *) + list_entry(cqp->ooo_avail.next, + struct irdma_ooo_cqp_op, list_entry); + + list_del(&ooo_op->list_entry); + ooo_op->scratch = info->scratch; + ooo_op->def_info = def_info; + ooo_op->sw_def_info = cqp->sw_def_cmpl_ticket; + ooo_op->deferred = def_cmpl; + ooo_op->wqe_idx = wqe_idx; + /* Pending completions must be chronologically ordered, + * so adding at the end of list. 
+ */ + list_add_tail(&ooo_op->list_entry, &cqp->ooo_pnd); + } + spin_unlock_irqrestore(&cqp->ooo_list_lock, flags); + + info->pending = true; +} + +/** + * irdma_sc_process_ooo_cmpl - process out-of-order (final) completion + * @cqp: CQP sc struct + * @info: CQP CQE info + * @def_info: out-of-order completion id + */ +static void irdma_sc_process_ooo_cmpl(struct irdma_sc_cqp *cqp, + struct irdma_ccq_cqe_info *info, + u32 def_info) +{ + struct irdma_ooo_cqp_op *ooo_op_tmp; + struct irdma_ooo_cqp_op *ooo_op; + unsigned long flags; + + info->scratch = 0; + + spin_lock_irqsave(&cqp->ooo_list_lock, flags); + list_for_each_entry_safe(ooo_op, ooo_op_tmp, &cqp->ooo_pnd, + list_entry) { + if (!ooo_op->deferred && ooo_op->def_info == def_info) { + list_del(&ooo_op->list_entry); + info->scratch = ooo_op->scratch; + list_add(&ooo_op->list_entry, &cqp->ooo_avail); + break; + } + } + spin_unlock_irqrestore(&cqp->ooo_list_lock, flags); + + if (!info->scratch) + ibdev_dbg(to_ibdev(cqp->dev), + "CQP: DEBUG_FW_OOO out-of-order completion with unknown def_info = 0x%x\n", + def_info); +} + +/** * irdma_sc_ccq_get_cqe_info - get ccq's cq entry * @ccq: ccq sc struct * @info: completion q entry to return @@ -3345,6 +4060,10 @@ void irdma_sc_ccq_arm(struct irdma_sc_cq *ccq) int irdma_sc_ccq_get_cqe_info(struct irdma_sc_cq *ccq, struct irdma_ccq_cqe_info *info) { + u32 def_info; + bool def_cmpl = false; + bool pend_cmpl = false; + bool ooo_final_cmpl = false; u64 qp_ctx, temp, temp1; __le64 *cqe; struct irdma_sc_cqp *cqp; @@ -3352,6 +4071,7 @@ int irdma_sc_ccq_get_cqe_info(struct irdma_sc_cq *ccq, u32 error; u8 polarity; int ret_code = 0; + unsigned long flags; if (ccq->cq_uk.avoid_mem_cflct) cqe = IRDMA_GET_CURRENT_EXTENDED_CQ_ELEM(&ccq->cq_uk); @@ -3383,6 +4103,25 @@ int irdma_sc_ccq_get_cqe_info(struct irdma_sc_cq *ccq, get_64bit_val(cqe, 16, &temp1); info->op_ret_val = (u32)FIELD_GET(IRDMA_CCQ_OPRETVAL, temp1); + if (cqp->dev->hw_attrs.uk_attrs.hw_rev >= IRDMA_GEN_3) { + def_cmpl = info->maj_err_code == IRDMA_CQPSQ_MAJ_NO_ERROR && + info->min_err_code == IRDMA_CQPSQ_MIN_DEF_CMPL; + def_info = (u32)FIELD_GET(IRDMA_CCQ_DEFINFO, temp1); + + pend_cmpl = info->maj_err_code == IRDMA_CQPSQ_MAJ_NO_ERROR && + info->min_err_code == IRDMA_CQPSQ_MIN_OOO_CMPL; + + ooo_final_cmpl = (bool)FIELD_GET(IRDMA_OOO_CMPL, temp); + + if (def_cmpl || pend_cmpl || ooo_final_cmpl) { + if (ooo_final_cmpl) + irdma_sc_process_ooo_cmpl(cqp, info, def_info); + else + irdma_sc_process_def_cmpl(cqp, info, wqe_idx, + def_info, def_cmpl); + } + } + get_64bit_val(cqp->sq_base[wqe_idx].elem, 24, &temp1); info->op_code = (u8)FIELD_GET(IRDMA_CQPSQ_OPCODE, temp1); info->cqp = cqp; @@ -3399,7 +4138,16 @@ int irdma_sc_ccq_get_cqe_info(struct irdma_sc_cq *ccq, dma_wmb(); /* make sure shadow area is updated before moving tail */ - IRDMA_RING_MOVE_TAIL(cqp->sq_ring); + spin_lock_irqsave(&cqp->dev->cqp_lock, flags); + if (!ooo_final_cmpl) + IRDMA_RING_MOVE_TAIL(cqp->sq_ring); + spin_unlock_irqrestore(&cqp->dev->cqp_lock, flags); + + /* Do not increment completed_ops counter on pending or deferred + * completions. + */ + if (pend_cmpl || def_cmpl) + return ret_code; atomic64_inc(&cqp->completed_ops); return ret_code; @@ -3647,7 +4395,7 @@ int irdma_sc_ceq_init(struct irdma_sc_ceq *ceq, ceq->pbl_list = (ceq->virtual_map ? 
info->pbl_list : NULL); ceq->tph_en = info->tph_en; ceq->tph_val = info->tph_val; - ceq->vsi = info->vsi; + ceq->vsi_idx = info->vsi_idx; ceq->polarity = 1; IRDMA_RING_INIT(ceq->ceq_ring, ceq->elem_cnt); ceq->dev->ceq[info->ceq_id] = ceq; @@ -3680,13 +4428,16 @@ static int irdma_sc_ceq_create(struct irdma_sc_ceq *ceq, u64 scratch, (ceq->virtual_map ? ceq->first_pm_pbl_idx : 0)); set_64bit_val(wqe, 56, FIELD_PREP(IRDMA_CQPSQ_TPHVAL, ceq->tph_val) | - FIELD_PREP(IRDMA_CQPSQ_VSIIDX, ceq->vsi->vsi_idx)); + FIELD_PREP(IRDMA_CQPSQ_PASID, ceq->pasid) | + FIELD_PREP(IRDMA_CQPSQ_VSIIDX, ceq->vsi_idx)); hdr = FIELD_PREP(IRDMA_CQPSQ_CEQ_CEQID, ceq->ceq_id) | + FIELD_PREP(IRDMA_CQPSQ_CEQ_CEQID_HIGH, ceq->ceq_id >> 10) | FIELD_PREP(IRDMA_CQPSQ_OPCODE, IRDMA_CQP_OP_CREATE_CEQ) | FIELD_PREP(IRDMA_CQPSQ_CEQ_LPBLSIZE, ceq->pbl_chunk_size) | FIELD_PREP(IRDMA_CQPSQ_CEQ_VMAP, ceq->virtual_map) | FIELD_PREP(IRDMA_CQPSQ_CEQ_ITRNOEXPIRE, ceq->itr_no_expire) | FIELD_PREP(IRDMA_CQPSQ_TPHEN, ceq->tph_en) | + FIELD_PREP(IRDMA_CQPSQ_PASID_VALID, ceq->pasid_valid) | FIELD_PREP(IRDMA_CQPSQ_WQEVALID, cqp->polarity); dma_wmb(); /* make sure WQE is written before valid bit is set */ @@ -3741,7 +4492,7 @@ int irdma_sc_cceq_create(struct irdma_sc_ceq *ceq, u64 scratch) int ret_code; struct irdma_sc_dev *dev = ceq->dev; - dev->ccq->vsi = ceq->vsi; + dev->ccq->vsi_idx = ceq->vsi_idx; if (ceq->reg_cq) { ret_code = irdma_sc_add_cq_ctx(ceq, ceq->dev->ccq); if (ret_code) @@ -3774,11 +4525,14 @@ int irdma_sc_ceq_destroy(struct irdma_sc_ceq *ceq, u64 scratch, bool post_sq) set_64bit_val(wqe, 16, ceq->elem_cnt); set_64bit_val(wqe, 48, ceq->first_pm_pbl_idx); + set_64bit_val(wqe, 56, + FIELD_PREP(IRDMA_CQPSQ_PASID, ceq->pasid)); hdr = ceq->ceq_id | FIELD_PREP(IRDMA_CQPSQ_OPCODE, IRDMA_CQP_OP_DESTROY_CEQ) | FIELD_PREP(IRDMA_CQPSQ_CEQ_LPBLSIZE, ceq->pbl_chunk_size) | FIELD_PREP(IRDMA_CQPSQ_CEQ_VMAP, ceq->virtual_map) | FIELD_PREP(IRDMA_CQPSQ_TPHEN, ceq->tph_en) | + FIELD_PREP(IRDMA_CQPSQ_PASID_VALID, ceq->pasid_valid) | FIELD_PREP(IRDMA_CQPSQ_WQEVALID, cqp->polarity); dma_wmb(); /* make sure WQE is written before valid bit is set */ @@ -3942,10 +4696,13 @@ static int irdma_sc_aeq_create(struct irdma_sc_aeq *aeq, u64 scratch, (aeq->virtual_map ? 0 : aeq->aeq_elem_pa)); set_64bit_val(wqe, 48, (aeq->virtual_map ? 
aeq->first_pm_pbl_idx : 0)); + set_64bit_val(wqe, 56, + FIELD_PREP(IRDMA_CQPSQ_PASID, aeq->pasid)); hdr = FIELD_PREP(IRDMA_CQPSQ_OPCODE, IRDMA_CQP_OP_CREATE_AEQ) | FIELD_PREP(IRDMA_CQPSQ_AEQ_LPBLSIZE, aeq->pbl_chunk_size) | FIELD_PREP(IRDMA_CQPSQ_AEQ_VMAP, aeq->virtual_map) | + FIELD_PREP(IRDMA_CQPSQ_PASID_VALID, aeq->pasid_valid) | FIELD_PREP(IRDMA_CQPSQ_WQEVALID, cqp->polarity); dma_wmb(); /* make sure WQE is written before valid bit is set */ @@ -3974,7 +4731,8 @@ static int irdma_sc_aeq_destroy(struct irdma_sc_aeq *aeq, u64 scratch, u64 hdr; dev = aeq->dev; - writel(0, dev->hw_regs[IRDMA_PFINT_AEQCTL]); + if (dev->privileged) + writel(0, dev->hw_regs[IRDMA_PFINT_AEQCTL]); cqp = dev->cqp; wqe = irdma_sc_cqp_get_next_send_wqe(cqp, scratch); @@ -3982,9 +4740,12 @@ static int irdma_sc_aeq_destroy(struct irdma_sc_aeq *aeq, u64 scratch, return -ENOMEM; set_64bit_val(wqe, 16, aeq->elem_cnt); set_64bit_val(wqe, 48, aeq->first_pm_pbl_idx); + set_64bit_val(wqe, 56, + FIELD_PREP(IRDMA_CQPSQ_PASID, aeq->pasid)); hdr = FIELD_PREP(IRDMA_CQPSQ_OPCODE, IRDMA_CQP_OP_DESTROY_AEQ) | FIELD_PREP(IRDMA_CQPSQ_AEQ_LPBLSIZE, aeq->pbl_chunk_size) | FIELD_PREP(IRDMA_CQPSQ_AEQ_VMAP, aeq->virtual_map) | + FIELD_PREP(IRDMA_CQPSQ_PASID_VALID, aeq->pasid_valid) | FIELD_PREP(IRDMA_CQPSQ_WQEVALID, cqp->polarity); dma_wmb(); /* make sure WQE is written before valid bit is set */ @@ -4025,18 +4786,39 @@ int irdma_sc_get_next_aeqe(struct irdma_sc_aeq *aeq, print_hex_dump_debug("WQE: AEQ_ENTRY WQE", DUMP_PREFIX_OFFSET, 16, 8, aeqe, 16, false); - ae_src = (u8)FIELD_GET(IRDMA_AEQE_AESRC, temp); - info->wqe_idx = (u16)FIELD_GET(IRDMA_AEQE_WQDESCIDX, temp); - info->qp_cq_id = (u32)FIELD_GET(IRDMA_AEQE_QPCQID_LOW, temp) | + if (aeq->dev->hw_attrs.uk_attrs.hw_rev >= IRDMA_GEN_3) { + ae_src = (u8)FIELD_GET(IRDMA_AEQE_AESRC_GEN_3, temp); + info->wqe_idx = (u16)FIELD_GET(IRDMA_AEQE_WQDESCIDX_GEN_3, + temp); + info->qp_cq_id = (u32)FIELD_GET(IRDMA_AEQE_QPCQID_GEN_3, temp); + info->ae_id = (u16)FIELD_GET(IRDMA_AEQE_AECODE_GEN_3, temp); + info->tcp_state = (u8)FIELD_GET(IRDMA_AEQE_TCPSTATE_GEN_3, compl_ctx); + info->iwarp_state = (u8)FIELD_GET(IRDMA_AEQE_IWSTATE_GEN_3, temp); + info->q2_data_written = (u8)FIELD_GET(IRDMA_AEQE_Q2DATA_GEN_3, compl_ctx); + info->aeqe_overflow = (bool)FIELD_GET(IRDMA_AEQE_OVERFLOW_GEN_3, temp); + info->compl_ctx = FIELD_GET(IRDMA_AEQE_CMPL_CTXT, compl_ctx); + compl_ctx = FIELD_GET(IRDMA_AEQE_CMPL_CTXT, compl_ctx) << IRDMA_AEQE_CMPL_CTXT_S; + } else { + ae_src = (u8)FIELD_GET(IRDMA_AEQE_AESRC, temp); + info->wqe_idx = (u16)FIELD_GET(IRDMA_AEQE_WQDESCIDX, temp); + info->qp_cq_id = (u32)FIELD_GET(IRDMA_AEQE_QPCQID_LOW, temp) | ((u32)FIELD_GET(IRDMA_AEQE_QPCQID_HI, temp) << 18); - info->ae_id = (u16)FIELD_GET(IRDMA_AEQE_AECODE, temp); - info->tcp_state = (u8)FIELD_GET(IRDMA_AEQE_TCPSTATE, temp); - info->iwarp_state = (u8)FIELD_GET(IRDMA_AEQE_IWSTATE, temp); - info->q2_data_written = (u8)FIELD_GET(IRDMA_AEQE_Q2DATA, temp); - info->aeqe_overflow = (bool)FIELD_GET(IRDMA_AEQE_OVERFLOW, temp); + info->ae_id = (u16)FIELD_GET(IRDMA_AEQE_AECODE, temp); + info->tcp_state = (u8)FIELD_GET(IRDMA_AEQE_TCPSTATE, temp); + info->iwarp_state = (u8)FIELD_GET(IRDMA_AEQE_IWSTATE, temp); + info->q2_data_written = (u8)FIELD_GET(IRDMA_AEQE_Q2DATA, temp); + info->aeqe_overflow = (bool)FIELD_GET(IRDMA_AEQE_OVERFLOW, + temp); + } info->ae_src = ae_src; switch (info->ae_id) { + case IRDMA_AE_SRQ_LIMIT: + info->srq = true; + /* [63:6] from CMPL_CTXT, [5:0] from WQDESCIDX. 
*/ + info->compl_ctx = compl_ctx; + ae_src = IRDMA_AE_SOURCE_RSVD; + break; case IRDMA_AE_PRIV_OPERATION_DENIED: case IRDMA_AE_AMP_INVALIDATE_TYPE1_MW: case IRDMA_AE_AMP_MWBIND_ZERO_BASED_TYPE1_MW: @@ -4069,6 +4851,10 @@ int irdma_sc_get_next_aeqe(struct irdma_sc_aeq *aeq, case IRDMA_AE_LLP_RECEIVED_MPA_CRC_ERROR: case IRDMA_AE_LLP_SEGMENT_TOO_SMALL: case IRDMA_AE_LLP_TOO_MANY_RETRIES: + case IRDMA_AE_LLP_TOO_MANY_RNRS: + case IRDMA_AE_REMOTE_QP_CATASTROPHIC: + case IRDMA_AE_LOCAL_QP_CATASTROPHIC: + case IRDMA_AE_RCE_QP_CATASTROPHIC: case IRDMA_AE_LLP_DOUBT_REACHABILITY: case IRDMA_AE_LLP_CONNECTION_ESTABLISHED: case IRDMA_AE_RESET_SENT: @@ -4085,6 +4871,10 @@ int irdma_sc_get_next_aeqe(struct irdma_sc_aeq *aeq, info->compl_ctx = compl_ctx << 1; ae_src = IRDMA_AE_SOURCE_RSVD; break; + case IRDMA_AE_CQP_DEFERRED_COMPLETE: + info->def_info = info->wqe_idx; + ae_src = IRDMA_AE_SOURCE_RSVD; + break; case IRDMA_AE_ROCE_EMPTY_MCG: case IRDMA_AE_ROCE_BAD_MC_IP_ADDR: case IRDMA_AE_ROCE_BAD_MC_QPID: @@ -4110,6 +4900,7 @@ int irdma_sc_get_next_aeqe(struct irdma_sc_aeq *aeq, info->qp = true; info->rq = true; info->compl_ctx = compl_ctx; + info->err_rq_idx_valid = true; break; case IRDMA_AE_SOURCE_CQ: case IRDMA_AE_SOURCE_CQ_0110: @@ -4125,8 +4916,18 @@ int irdma_sc_get_next_aeqe(struct irdma_sc_aeq *aeq, info->compl_ctx = compl_ctx; break; case IRDMA_AE_SOURCE_IN_RR_WR: + info->qp = true; + if (aeq->dev->hw_attrs.uk_attrs.hw_rev >= IRDMA_GEN_3) + info->err_rq_idx_valid = true; + info->compl_ctx = compl_ctx; + info->in_rdrsp_wr = true; + break; case IRDMA_AE_SOURCE_IN_RR_WR_1011: info->qp = true; + if (aeq->dev->hw_attrs.uk_attrs.hw_rev >= IRDMA_GEN_3) { + info->sq = true; + info->err_rq_idx_valid = true; + } info->compl_ctx = compl_ctx; info->in_rdrsp_wr = true; break; @@ -4336,6 +5137,26 @@ int irdma_sc_init_iw_hmc(struct irdma_sc_dev *dev, u8 hmc_fn_id) } /** + * irdma_set_loc_mem() - set a local memory bit field + * @buf: ptr to a buffer where local memory gets enabled + */ +static void irdma_set_loc_mem(__le64 *buf) +{ + u64 loc_mem_en = BIT_ULL(ENABLE_LOC_MEM); + u32 offset; + u64 temp; + + for (offset = 0; offset < IRDMA_COMMIT_FPM_BUF_SIZE; + offset += sizeof(__le64)) { + if (offset == IRDMA_PBLE_COMMIT_OFFSET) + continue; + get_64bit_val(buf, offset, &temp); + if (temp) + set_64bit_val(buf, offset, temp | loc_mem_en); + } +} + +/** * irdma_sc_cfg_iw_fpm() - commits hmc obj cnt values using cqp * command and populates fpm base address in hmc_info * @dev : ptr to irdma_dev struct @@ -4356,7 +5177,7 @@ static int irdma_sc_cfg_iw_fpm(struct irdma_sc_dev *dev, u8 hmc_fn_id) set_64bit_val(buf, 0, (u64)obj_info[IRDMA_HMC_IW_QP].cnt); set_64bit_val(buf, 8, (u64)obj_info[IRDMA_HMC_IW_CQ].cnt); - set_64bit_val(buf, 16, (u64)0); /* RSRVD */ + set_64bit_val(buf, 16, (u64)obj_info[IRDMA_HMC_IW_SRQ].cnt); set_64bit_val(buf, 24, (u64)obj_info[IRDMA_HMC_IW_HTE].cnt); set_64bit_val(buf, 32, (u64)obj_info[IRDMA_HMC_IW_ARP].cnt); set_64bit_val(buf, 40, (u64)0); /* RSVD */ @@ -4383,7 +5204,9 @@ static int irdma_sc_cfg_iw_fpm(struct irdma_sc_dev *dev, u8 hmc_fn_id) (u64)obj_info[IRDMA_HMC_IW_OOISC].cnt); set_64bit_val(buf, 168, (u64)obj_info[IRDMA_HMC_IW_OOISCFFL].cnt); - + if (dev->hw_attrs.uk_attrs.hw_rev >= IRDMA_GEN_3 && + dev->hmc_fpm_misc.loc_mem_pages) + irdma_set_loc_mem(buf); commit_fpm_mem.pa = dev->fpm_commit_buf_pa; commit_fpm_mem.va = dev->fpm_commit_buf; @@ -4592,6 +5415,7 @@ static bool irdma_cqp_ring_full(struct irdma_sc_cqp *cqp) static u32 irdma_est_sd(struct irdma_sc_dev *dev, struct 
irdma_hmc_info *hmc_info)
 {
+	struct irdma_hmc_obj_info *pble_info;
 	int i;
 	u64 size = 0;
 	u64 sd;
@@ -4600,12 +5424,22 @@ static u32 irdma_est_sd(struct irdma_sc_dev *dev,
 		if (i != IRDMA_HMC_IW_PBLE)
 			size += round_up(hmc_info->hmc_obj[i].cnt *
 					 hmc_info->hmc_obj[i].size, 512);
-	size += round_up(hmc_info->hmc_obj[IRDMA_HMC_IW_PBLE].cnt *
-			 hmc_info->hmc_obj[IRDMA_HMC_IW_PBLE].size, 512);
+
+	pble_info = &hmc_info->hmc_obj[IRDMA_HMC_IW_PBLE];
+	if (dev->privileged)
+		size += round_up(pble_info->cnt * pble_info->size, 512);
 	if (size & 0x1FFFFF)
 		sd = (size >> 21) + 1; /* add 1 for remainder */
 	else
 		sd = size >> 21;
+	if (!dev->privileged && !dev->hmc_fpm_misc.loc_mem_pages) {
+		/* 2MB alignment for VF PBLE HMC */
+		size = pble_info->cnt * pble_info->size;
+		if (size & 0x1FFFFF)
+			sd += (size >> 21) + 1; /* add 1 for remainder */
+		else
+			sd += size >> 21;
+	}
 	if (sd > 0xFFFFFFFF) {
 		ibdev_dbg(to_ibdev(dev), "HMC: sd overflow[%lld]\n", sd);
 		sd = 0xFFFFFFFF - 1;
@@ -4615,17 +5449,6 @@ static u32 irdma_est_sd(struct irdma_sc_dev *dev,
 }
 
 /**
- * irdma_sc_query_rdma_features_done - poll cqp for query features done
- * @cqp: struct for cqp hw
- */
-static int irdma_sc_query_rdma_features_done(struct irdma_sc_cqp *cqp)
-{
-	return irdma_sc_poll_for_cqp_op_done(cqp,
-					     IRDMA_CQP_OP_QUERY_RDMA_FEATURES,
-					     NULL);
-}
-
-/**
  * irdma_sc_query_rdma_features - query RDMA features and FW ver
  * @cqp: struct for cqp hw
  * @buf: buffer to hold query info
@@ -4634,7 +5457,9 @@ static int irdma_sc_query_rdma_features_done(struct irdma_sc_cqp *cqp)
 static int irdma_sc_query_rdma_features(struct irdma_sc_cqp *cqp,
 					struct irdma_dma_mem *buf, u64 scratch)
 {
+	u32 tail, val, error;
 	__le64 *wqe;
+	int status;
 	u64 temp;
 
 	wqe = irdma_sc_cqp_get_next_send_wqe(cqp, scratch);
@@ -4654,9 +5479,15 @@ static int irdma_sc_query_rdma_features(struct irdma_sc_cqp *cqp,
 	print_hex_dump_debug("WQE: QUERY RDMA FEATURES", DUMP_PREFIX_OFFSET, 16,
 			     8, wqe, IRDMA_CQP_WQE_SIZE * 8, false);
+	irdma_get_cqp_reg_info(cqp, &val, &tail, &error);
+
 	irdma_sc_cqp_post_sq(cqp);
+	status = irdma_cqp_poll_registers(cqp, tail,
+					  cqp->dev->hw_attrs.max_done_count);
+	if (error || status)
+		status = -EINVAL;
 
-	return 0;
+	return status;
 }
 
 /**
@@ -4678,8 +5509,6 @@ int irdma_get_rdma_features(struct irdma_sc_dev *dev)
 		return -ENOMEM;
 
 	ret_code = irdma_sc_query_rdma_features(dev->cqp, &feat_buf, 0);
-	if (!ret_code)
-		ret_code = irdma_sc_query_rdma_features_done(dev->cqp);
 	if (ret_code)
 		goto exit;
 
@@ -4703,8 +5532,6 @@ int irdma_get_rdma_features(struct irdma_sc_dev *dev)
 		return -ENOMEM;
 
 	ret_code = irdma_sc_query_rdma_features(dev->cqp, &feat_buf, 0);
-	if (!ret_code)
-		ret_code = irdma_sc_query_rdma_features_done(dev->cqp);
 	if (ret_code)
 		goto exit;
 
@@ -4731,6 +5558,10 @@ int irdma_get_rdma_features(struct irdma_sc_dev *dev)
 		}
 		dev->feature_info[feat_type] = temp;
 	}
+
+	if (dev->feature_info[IRDMA_FTN_FLAGS] & IRDMA_ATOMICS_ALLOWED_BIT)
+		dev->hw_attrs.uk_attrs.feature_flags |= IRDMA_FEATURE_ATOMIC_OPS;
+
 exit:
 	dma_free_coherent(dev->hw->device, feat_buf.size, feat_buf.va,
 			  feat_buf.pa);
@@ -4786,22 +5617,354 @@ static void cfg_fpm_value_gen_2(struct irdma_sc_dev *dev,
 }
 
 /**
+ * irdma_get_rsrc_mem_config - configure resources in local or host memory
+ * @dev: sc device struct
+ * @is_mrte_loc_mem: if true, MRs are to be in local memory because all SDs
+ * are local-memory pages
+ *
+ * Only the MR object can be configured as host or local memory if QPs are
+ * in local memory. If QPs are in local memory, then all resource objects
+ * will be in local memory except MR, which can be either host or local
+ * memory. The only exception is PBLEs, which are always in host memory.
+ */
+static void irdma_get_rsrc_mem_config(struct irdma_sc_dev *dev, bool is_mrte_loc_mem)
+{
+	struct irdma_hmc_info *hmc_info = dev->hmc_info;
+	int i;
+
+	for (i = IRDMA_HMC_IW_QP; i < IRDMA_HMC_IW_MAX; i++)
+		hmc_info->hmc_obj[i].mem_loc = IRDMA_LOC_MEM;
+
+	if (dev->feature_info[IRDMA_OBJ_1] && !is_mrte_loc_mem) {
+		u8 mem_type;
+
+		mem_type = (u8)FIELD_GET(IRDMA_MR_MEM_LOC, dev->feature_info[IRDMA_OBJ_1]);
+
+		hmc_info->hmc_obj[IRDMA_HMC_IW_MR].mem_loc =
+			(mem_type & IRDMA_OBJ_LOC_MEM_BIT) ?
+			IRDMA_LOC_MEM : IRDMA_HOST_MEM;
+	} else {
+		hmc_info->hmc_obj[IRDMA_HMC_IW_MR].mem_loc = IRDMA_LOC_MEM;
+	}
+
+	hmc_info->hmc_obj[IRDMA_HMC_IW_PBLE].mem_loc = IRDMA_HOST_MEM;
+
+	ibdev_dbg(to_ibdev(dev), "HMC: INFO: mrte_mem_loc = %d pble = %d\n",
+		  hmc_info->hmc_obj[IRDMA_HMC_IW_MR].mem_loc,
+		  hmc_info->hmc_obj[IRDMA_HMC_IW_PBLE].mem_loc);
+}
+
+/**
+ * irdma_cfg_sd_mem - allocate sd memory
+ * @dev: sc device struct
+ * @hmc_info: ptr to irdma_hmc_info struct
+ */
+static int irdma_cfg_sd_mem(struct irdma_sc_dev *dev,
+			    struct irdma_hmc_info *hmc_info)
+{
+	struct irdma_virt_mem virt_mem;
+	u32 mem_size;
+
+	mem_size = sizeof(struct irdma_hmc_sd_entry) * hmc_info->sd_table.sd_cnt;
+	virt_mem.size = mem_size;
+	virt_mem.va = kzalloc(virt_mem.size, GFP_KERNEL);
+	if (!virt_mem.va)
+		return -ENOMEM;
+	hmc_info->sd_table.sd_entry = virt_mem.va;
+
+	return 0;
+}
+
+/**
+ * irdma_get_objs_pages - get number of 2M pages needed
+ * @dev: sc device struct
+ * @hmc_info: pointer to the HMC configuration information struct
+ * @mem_loc: pages for local or host memory
+ */
+static u32 irdma_get_objs_pages(struct irdma_sc_dev *dev,
+				struct irdma_hmc_info *hmc_info,
+				enum irdma_hmc_obj_mem mem_loc)
+{
+	u64 size = 0;
+	int i;
+
+	for (i = IRDMA_HMC_IW_QP; i < IRDMA_HMC_IW_MAX; i++) {
+		if (hmc_info->hmc_obj[i].mem_loc == mem_loc) {
+			size += round_up(hmc_info->hmc_obj[i].cnt *
+					 hmc_info->hmc_obj[i].size, 512);
+		}
+	}
+
+	return DIV_ROUND_UP(size, IRDMA_HMC_PAGE_SIZE);
+}
+
+/**
+ * irdma_set_host_hmc_rsrc_gen_3 - calculate host hmc resources for gen 3
+ * @dev: sc device struct
+ */
+static void irdma_set_host_hmc_rsrc_gen_3(struct irdma_sc_dev *dev)
+{
+	struct irdma_hmc_fpm_misc *hmc_fpm_misc;
+	struct irdma_hmc_info *hmc_info;
+	enum irdma_hmc_obj_mem mrte_loc;
+	u32 mrwanted, pblewanted;
+	u32 avail_sds, mr_sds;
+
+	hmc_info = dev->hmc_info;
+	hmc_fpm_misc = &dev->hmc_fpm_misc;
+	avail_sds = hmc_fpm_misc->max_sds;
+	mrte_loc = hmc_info->hmc_obj[IRDMA_HMC_IW_MR].mem_loc;
+	mrwanted = hmc_info->hmc_obj[IRDMA_HMC_IW_MR].cnt;
+	pblewanted = hmc_info->hmc_obj[IRDMA_HMC_IW_PBLE].max_cnt;
+
+	if (mrte_loc == IRDMA_HOST_MEM && avail_sds > IRDMA_MIN_PBLE_PAGES) {
+		mr_sds = avail_sds - IRDMA_MIN_PBLE_PAGES;
+		mrwanted = min(mrwanted, mr_sds * MAX_MR_PER_SD);
+		hmc_info->hmc_obj[IRDMA_HMC_IW_MR].cnt = mrwanted;
+		avail_sds -= DIV_ROUND_UP(mrwanted, MAX_MR_PER_SD);
+	}
+
+	if (FIELD_GET(IRDMA_MANAGE_RSRC_VER2, dev->feature_info[IRDMA_FTN_FLAGS]) &&
+	    pblewanted > avail_sds * MAX_PBLE_PER_SD)
+		ibdev_dbg(to_ibdev(dev),
+			  "HMC: Warn: Resource version 2: pble wanted = 0x%x available = 0x%x\n",
+			  pblewanted, avail_sds * MAX_PBLE_PER_SD);
+
+	pblewanted = min(pblewanted, avail_sds * MAX_PBLE_PER_SD);
+	hmc_info->hmc_obj[IRDMA_HMC_IW_PBLE].cnt = pblewanted;
+}
+
+/**
+ * irdma_verify_commit_fpm_gen_3 - verify
query fpm values + * @dev: sc device struct + * @max_pages: max local memory available + * @qpwanted: number of qp's wanted + */ +static int irdma_verify_commit_fpm_gen_3(struct irdma_sc_dev *dev, + u32 max_pages, + u32 qpwanted) +{ + struct irdma_hmc_fpm_misc *hmc_fpm_misc; + u32 rrf_cnt, xf_cnt, timer_cnt, pages_needed; + struct irdma_hmc_info *hmc_info; + u32 rrffl_cnt = 0; + u32 xffl_cnt = 0; + u32 q1fl_cnt; + + hmc_info = dev->hmc_info; + hmc_fpm_misc = &dev->hmc_fpm_misc; + + rrf_cnt = roundup_pow_of_two(IRDMA_RRF_MULTIPLIER * qpwanted); + + if (hmc_info->hmc_obj[IRDMA_HMC_IW_RRFFL].max_cnt) + rrffl_cnt = + hmc_info->hmc_obj[IRDMA_HMC_IW_RRF].cnt / + hmc_fpm_misc->rrf_block_size; + + xf_cnt = roundup_pow_of_two(IRDMA_XF_MULTIPLIER * qpwanted); + + if (xf_cnt) + xffl_cnt = xf_cnt / hmc_fpm_misc->xf_block_size; + + timer_cnt = (round_up(qpwanted, 512) / 512 + 1) * + hmc_fpm_misc->timer_bucket; + + q1fl_cnt = hmc_info->hmc_obj[IRDMA_HMC_IW_Q1].cnt / hmc_fpm_misc->q1_block_size; + + pages_needed = irdma_get_objs_pages(dev, hmc_info, IRDMA_LOC_MEM); + if (pages_needed > max_pages) { + ibdev_dbg(to_ibdev(dev), + "HMC: FAIL: SW counts rrf_cnt = %u rrffl_cnt = %u timer_cnt = %u", + rrf_cnt, rrffl_cnt, timer_cnt); + ibdev_dbg(to_ibdev(dev), + "HMC: FAIL: SW counts xf_cnt = %u xffl_cnt = %u q1fl_cnt = %u", + xf_cnt, xffl_cnt, q1fl_cnt); + + return -EINVAL; + } + + hmc_fpm_misc->max_sds -= pages_needed; + hmc_fpm_misc->loc_mem_pages -= pages_needed; + + return 0; +} + +/** + * irdma_set_loc_hmc_rsrc_gen_3 - calculate hmc resources for gen 3 + * @dev: sc device struct + * @max_pages: max local memory available + * @qpwanted: number of qp's wanted + */ +static int irdma_set_loc_hmc_rsrc_gen_3(struct irdma_sc_dev *dev, + u32 max_pages, + u32 qpwanted) +{ + struct irdma_hmc_fpm_misc *hmc_fpm_misc; + u32 rrf_cnt, xf_cnt, timer_cnt, pages_needed; + struct irdma_hmc_info *hmc_info; + u32 ird, ord; + + if (FIELD_GET(IRDMA_MANAGE_RSRC_VER2, dev->feature_info[IRDMA_FTN_FLAGS])) + return irdma_verify_commit_fpm_gen_3(dev, max_pages, qpwanted); + + hmc_info = dev->hmc_info; + hmc_fpm_misc = &dev->hmc_fpm_misc; + ird = dev->hw_attrs.max_hw_ird; + ord = dev->hw_attrs.max_hw_ord; + + hmc_info->hmc_obj[IRDMA_HMC_IW_HDR].cnt = qpwanted; + hmc_info->hmc_obj[IRDMA_HMC_IW_QP].cnt = qpwanted; + + hmc_info->hmc_obj[IRDMA_HMC_IW_CQ].cnt = + min(hmc_info->hmc_obj[IRDMA_HMC_IW_CQ].cnt, qpwanted * 2); + + hmc_info->hmc_obj[IRDMA_HMC_IW_FSIAV].cnt = + min(qpwanted * 8, hmc_info->hmc_obj[IRDMA_HMC_IW_FSIAV].max_cnt); + + rrf_cnt = roundup_pow_of_two(IRDMA_RRF_MULTIPLIER * qpwanted); + hmc_info->hmc_obj[IRDMA_HMC_IW_RRF].cnt = + min(hmc_info->hmc_obj[IRDMA_HMC_IW_RRF].max_cnt, rrf_cnt); + + if (hmc_info->hmc_obj[IRDMA_HMC_IW_RRFFL].max_cnt) + hmc_info->hmc_obj[IRDMA_HMC_IW_RRFFL].cnt = + hmc_info->hmc_obj[IRDMA_HMC_IW_RRF].cnt / + hmc_fpm_misc->rrf_block_size; + + xf_cnt = roundup_pow_of_two(IRDMA_XF_MULTIPLIER * qpwanted); + hmc_info->hmc_obj[IRDMA_HMC_IW_XF].cnt = + min(hmc_info->hmc_obj[IRDMA_HMC_IW_XF].max_cnt, xf_cnt); + hmc_info->hmc_obj[IRDMA_HMC_IW_XFFL].cnt = + xf_cnt / hmc_fpm_misc->xf_block_size; + + timer_cnt = (round_up(qpwanted, 512) / 512 + 1) * + hmc_fpm_misc->timer_bucket; + hmc_info->hmc_obj[IRDMA_HMC_IW_TIMER].cnt = + min(timer_cnt, hmc_info->hmc_obj[IRDMA_HMC_IW_TIMER].cnt); + + do { + hmc_info->hmc_obj[IRDMA_HMC_IW_Q1].cnt = roundup_pow_of_two(ird * 2 * qpwanted); + hmc_info->hmc_obj[IRDMA_HMC_IW_Q1FL].cnt = + hmc_info->hmc_obj[IRDMA_HMC_IW_Q1].cnt / hmc_fpm_misc->q1_block_size; + + pages_needed = 
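The counts checked in irdma_verify_commit_fpm_gen_3 all derive from qpwanted: RRFs at 8x (IRDMA_RRF_MULTIPLIER) and XFs at 16x (IRDMA_XF_MULTIPLIER), each rounded up to a power of two, plus one timer object group per 512 QPs. A worked example for qpwanted = 1024; the timer_bucket value here is an assumption, since the driver reads it from the QUERY FPM output:

#include <stdint.h>
#include <stdio.h>

/* roundup_pow_of_two / round_up equivalents for a standalone build */
static uint32_t rup_pow2(uint32_t x) { uint32_t p = 1; while (p < x) p <<= 1; return p; }
static uint32_t rup(uint32_t x, uint32_t m) { return (x + m - 1) / m * m; }

int main(void)
{
        uint32_t qpwanted = 1024;     /* example only */
        uint32_t timer_bucket = 2;    /* assumed; comes from QUERY FPM */

        uint32_t rrf_cnt = rup_pow2(8 * qpwanted);   /* IRDMA_RRF_MULTIPLIER */
        uint32_t xf_cnt = rup_pow2(16 * qpwanted);   /* IRDMA_XF_MULTIPLIER */
        uint32_t timer_cnt = (rup(qpwanted, 512) / 512 + 1) * timer_bucket;

        /* 8192 RRFs, 16384 XFs, (1024/512 + 1) * 2 = 6 timer objects */
        printf("rrf=%u xf=%u timer=%u\n", rrf_cnt, xf_cnt, timer_cnt);
        return 0;
}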
irdma_get_objs_pages(dev, hmc_info, IRDMA_LOC_MEM); + if (pages_needed <= max_pages) + break; + + ird /= 2; + ord /= 2; + } while (ird >= IRDMA_MIN_IRD); + + if (ird < IRDMA_MIN_IRD) { + ibdev_dbg(to_ibdev(dev), "HMC: FAIL: IRD=%u Q1 CNT = %u\n", + ird, hmc_info->hmc_obj[IRDMA_HMC_IW_Q1].cnt); + return -EINVAL; + } + + dev->hw_attrs.max_hw_ird = ird; + dev->hw_attrs.max_hw_ord = ord; + hmc_fpm_misc->max_sds -= pages_needed; + + return 0; +} + +/** + * cfg_fpm_value_gen_3 - configure fpm for gen 3 + * @dev: sc device struct + * @hmc_info: ptr to irdma_hmc_obj_info struct + * @hmc_fpm_misc: ptr to fpm data + */ +static int cfg_fpm_value_gen_3(struct irdma_sc_dev *dev, + struct irdma_hmc_info *hmc_info, + struct irdma_hmc_fpm_misc *hmc_fpm_misc) +{ + enum irdma_hmc_obj_mem mrte_loc; + u32 mrwanted, qpwanted; + int i, ret_code = 0; + u32 loc_mem_pages; + bool is_mrte_loc_mem; + + loc_mem_pages = hmc_fpm_misc->loc_mem_pages; + is_mrte_loc_mem = hmc_fpm_misc->loc_mem_pages == hmc_fpm_misc->max_sds ? + true : false; + + irdma_get_rsrc_mem_config(dev, is_mrte_loc_mem); + mrte_loc = hmc_info->hmc_obj[IRDMA_HMC_IW_MR].mem_loc; + + if (is_mrte_loc_mem) + loc_mem_pages -= IRDMA_MIN_PBLE_PAGES; + + ibdev_dbg(to_ibdev(dev), + "HMC: mrte_loc %d loc_mem %u fpm max sds %u host_obj %d\n", + hmc_info->hmc_obj[IRDMA_HMC_IW_MR].mem_loc, + hmc_fpm_misc->loc_mem_pages, hmc_fpm_misc->max_sds, + is_mrte_loc_mem); + + mrwanted = hmc_info->hmc_obj[IRDMA_HMC_IW_MR].max_cnt; + qpwanted = hmc_info->hmc_obj[IRDMA_HMC_IW_QP].max_cnt; + hmc_info->hmc_obj[IRDMA_HMC_IW_HDR].cnt = qpwanted; + + hmc_info->hmc_obj[IRDMA_HMC_IW_OOISC].max_cnt = 0; + hmc_info->hmc_obj[IRDMA_HMC_IW_OOISCFFL].max_cnt = 0; + hmc_info->hmc_obj[IRDMA_HMC_IW_HTE].max_cnt = 0; + hmc_info->hmc_obj[IRDMA_HMC_IW_FSIMC].max_cnt = 0; + + if (!FIELD_GET(IRDMA_MANAGE_RSRC_VER2, dev->feature_info[IRDMA_FTN_FLAGS])) + hmc_info->hmc_obj[IRDMA_HMC_IW_FSIAV].max_cnt = + min(hmc_info->hmc_obj[IRDMA_HMC_IW_FSIAV].max_cnt, + (u32)IRDMA_FSIAV_CNT_MAX); + + for (i = IRDMA_HMC_IW_QP; i < IRDMA_HMC_IW_MAX; i++) + hmc_info->hmc_obj[i].cnt = hmc_info->hmc_obj[i].max_cnt; + + while (qpwanted >= IRDMA_MIN_QP_CNT) { + if (!irdma_set_loc_hmc_rsrc_gen_3(dev, loc_mem_pages, qpwanted)) + break; + + if (FIELD_GET(IRDMA_MANAGE_RSRC_VER2, dev->feature_info[IRDMA_FTN_FLAGS])) + return -EINVAL; + + qpwanted /= 2; + if (mrte_loc == IRDMA_LOC_MEM) { + mrwanted = qpwanted * IRDMA_MIN_MR_PER_QP; + hmc_info->hmc_obj[IRDMA_HMC_IW_MR].cnt = + min(hmc_info->hmc_obj[IRDMA_HMC_IW_MR].max_cnt, mrwanted); + } + } + + if (qpwanted < IRDMA_MIN_QP_CNT) { + ibdev_dbg(to_ibdev(dev), + "HMC: ERROR: could not allocate fpm resources\n"); + return -EINVAL; + } + + irdma_set_host_hmc_rsrc_gen_3(dev); + ret_code = irdma_sc_cfg_iw_fpm(dev, dev->hmc_fn_id); + if (ret_code) { + ibdev_dbg(to_ibdev(dev), + "HMC: cfg_iw_fpm returned error_code[x%08X]\n", + readl(dev->hw_regs[IRDMA_CQPERRCODES])); + + return ret_code; + } + + return irdma_cfg_sd_mem(dev, hmc_info); +} + +/** * irdma_cfg_fpm_val - configure HMC objects * @dev: sc device struct * @qp_count: desired qp count */ int irdma_cfg_fpm_val(struct irdma_sc_dev *dev, u32 qp_count) { - struct irdma_virt_mem virt_mem; - u32 i, mem_size; u32 qpwanted, mrwanted, pblewanted; - u32 powerof2, hte; + u32 powerof2, hte, i; u32 sd_needed; u32 sd_diff; u32 loop_count = 0; struct irdma_hmc_info *hmc_info; struct irdma_hmc_fpm_misc *hmc_fpm_misc; int ret_code = 0; + u32 max_sds; hmc_info = dev->hmc_info; hmc_fpm_misc = &dev->hmc_fpm_misc; @@ -4814,14 +5977,16 @@ int 
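cfg_fpm_value_gen_3 above is a back-off search: start at the maximum QP count and halve on every failed fit, scaling MRs down to IRDMA_MIN_MR_PER_QP per QP when MRs live in local memory, until the working set fits or qpwanted drops below IRDMA_MIN_QP_CNT. The skeleton of that loop as a sketch, with a hypothetical fits() predicate standing in for irdma_set_loc_hmc_rsrc_gen_3:

#include <stdint.h>
#include <stdbool.h>

#define MIN_QP_CNT    64   /* IRDMA_MIN_QP_CNT */
#define MIN_MR_PER_QP 4    /* IRDMA_MIN_MR_PER_QP */

/* hypothetical: "does this QP count fit in loc_mem_pages?" */
bool fits(uint32_t qpwanted, uint32_t loc_mem_pages);

static int pick_qp_count(uint32_t *qpwanted, uint32_t *mrwanted,
                         uint32_t mr_max, uint32_t loc_mem_pages,
                         bool mr_in_loc_mem)
{
        while (*qpwanted >= MIN_QP_CNT) {
                if (fits(*qpwanted, loc_mem_pages))
                        return 0;
                *qpwanted /= 2;
                if (mr_in_loc_mem) {
                        uint32_t want = *qpwanted * MIN_MR_PER_QP;

                        *mrwanted = want < mr_max ? want : mr_max;
                }
        }
        return -1;      /* could not allocate FPM resources */
}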
irdma_cfg_fpm_val(struct irdma_sc_dev *dev, u32 qp_count) return ret_code; } + max_sds = hmc_fpm_misc->max_sds; + + if (dev->hw_attrs.uk_attrs.hw_rev >= IRDMA_GEN_3) + return cfg_fpm_value_gen_3(dev, hmc_info, hmc_fpm_misc); + for (i = IRDMA_HMC_IW_QP; i < IRDMA_HMC_IW_MAX; i++) hmc_info->hmc_obj[i].cnt = hmc_info->hmc_obj[i].max_cnt; sd_needed = irdma_est_sd(dev, hmc_info); - ibdev_dbg(to_ibdev(dev), - "HMC: FW max resources sd_needed[%08d] first_sd_index[%04d]\n", - sd_needed, hmc_info->first_sd_index); - ibdev_dbg(to_ibdev(dev), "HMC: sd count %d where max sd is %d\n", - hmc_info->sd_table.sd_cnt, hmc_fpm_misc->max_sds); + ibdev_dbg(to_ibdev(dev), "HMC: sd count %u where max sd is %u\n", + hmc_info->sd_table.sd_cnt, max_sds); qpwanted = min(qp_count, hmc_info->hmc_obj[IRDMA_HMC_IW_QP].max_cnt); @@ -4835,21 +6000,21 @@ int irdma_cfg_fpm_val(struct irdma_sc_dev *dev, u32 qp_count) pblewanted = hmc_info->hmc_obj[IRDMA_HMC_IW_PBLE].max_cnt; ibdev_dbg(to_ibdev(dev), - "HMC: req_qp=%d max_sd=%d, max_qp = %d, max_cq=%d, max_mr=%d, max_pble=%d, mc=%d, av=%d\n", - qp_count, hmc_fpm_misc->max_sds, + "HMC: req_qp=%d max_sd=%u, max_qp = %u, max_cq=%u, max_mr=%u, max_pble=%u, mc=%d, av=%u\n", + qp_count, max_sds, hmc_info->hmc_obj[IRDMA_HMC_IW_QP].max_cnt, hmc_info->hmc_obj[IRDMA_HMC_IW_CQ].max_cnt, hmc_info->hmc_obj[IRDMA_HMC_IW_MR].max_cnt, hmc_info->hmc_obj[IRDMA_HMC_IW_PBLE].max_cnt, hmc_info->hmc_obj[IRDMA_HMC_IW_FSIMC].max_cnt, hmc_info->hmc_obj[IRDMA_HMC_IW_FSIAV].max_cnt); + hmc_info->hmc_obj[IRDMA_HMC_IW_FSIMC].cnt = hmc_info->hmc_obj[IRDMA_HMC_IW_FSIMC].max_cnt; hmc_info->hmc_obj[IRDMA_HMC_IW_FSIAV].cnt = hmc_info->hmc_obj[IRDMA_HMC_IW_FSIAV].max_cnt; hmc_info->hmc_obj[IRDMA_HMC_IW_ARP].cnt = hmc_info->hmc_obj[IRDMA_HMC_IW_ARP].max_cnt; - hmc_info->hmc_obj[IRDMA_HMC_IW_APBVT_ENTRY].cnt = 1; while (irdma_q1_cnt(dev, hmc_info, qpwanted) > hmc_info->hmc_obj[IRDMA_HMC_IW_Q1].max_cnt) @@ -4860,7 +6025,7 @@ int irdma_cfg_fpm_val(struct irdma_sc_dev *dev, u32 qp_count) hmc_info->hmc_obj[IRDMA_HMC_IW_QP].cnt = qpwanted; hmc_info->hmc_obj[IRDMA_HMC_IW_CQ].cnt = min(2 * qpwanted, hmc_info->hmc_obj[IRDMA_HMC_IW_CQ].cnt); - hmc_info->hmc_obj[IRDMA_HMC_IW_RESERVED].cnt = 0; /* Reserved */ + hmc_info->hmc_obj[IRDMA_HMC_IW_SRQ].cnt = 0; /* Reserved */ hmc_info->hmc_obj[IRDMA_HMC_IW_MR].cnt = mrwanted; hte = round_up(qpwanted + hmc_info->hmc_obj[IRDMA_HMC_IW_FSIMC].cnt, 512); @@ -4898,11 +6063,12 @@ int irdma_cfg_fpm_val(struct irdma_sc_dev *dev, u32 qp_count) if (!(loop_count % 2) && qpwanted > 128) { qpwanted /= 2; } else { - mrwanted /= 2; pblewanted /= 2; + mrwanted /= 2; } continue; } + if (dev->cqp->hmc_profile != IRDMA_HMC_PROFILE_FAVOR_VF && pblewanted > (512 * FPM_MULTIPLIER * sd_diff)) { pblewanted -= 256 * FPM_MULTIPLIER * sd_diff; @@ -4928,14 +6094,13 @@ int irdma_cfg_fpm_val(struct irdma_sc_dev *dev, u32 qp_count) if (sd_needed > hmc_fpm_misc->max_sds) { ibdev_dbg(to_ibdev(dev), - "HMC: cfg_fpm failed loop_cnt=%d, sd_needed=%d, max sd count %d\n", + "HMC: cfg_fpm failed loop_cnt=%u, sd_needed=%u, max sd count %u\n", loop_count, sd_needed, hmc_info->sd_table.sd_cnt); return -EINVAL; } - if (loop_count > 1 && sd_needed < hmc_fpm_misc->max_sds) { - pblewanted += (hmc_fpm_misc->max_sds - sd_needed) * 256 * - FPM_MULTIPLIER; + if (loop_count > 1 && sd_needed < max_sds) { + pblewanted += (max_sds - sd_needed) * 256 * FPM_MULTIPLIER; hmc_info->hmc_obj[IRDMA_HMC_IW_PBLE].cnt = pblewanted; sd_needed = irdma_est_sd(dev, hmc_info); } @@ -4959,18 +6124,7 @@ int irdma_cfg_fpm_val(struct irdma_sc_dev *dev, 
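The pblewanted adjustment above converts spare SDs back into PBLEs: a 2MB SD holds 0x40000 = 262144 eight-byte PBLEs (MAX_PBLE_PER_SD in the defs.h hunk later in this patch), and 262144 = 256 * FPM_MULTIPLIER with FPM_MULTIPLIER = 1024 from hmc.h. A one-line helper making that arithmetic explicit:

#include <stdint.h>

#define FPM_MULTIPLIER 1024

/* One 2MB segment descriptor holds 2MB / 8B = 262144 PBLEs, which is
 * exactly 256 * FPM_MULTIPLIER - the constant used in the loop above.
 */
static uint32_t pbles_for_spare_sds(uint32_t max_sds, uint32_t sd_needed)
{
        return (max_sds - sd_needed) * 256 * FPM_MULTIPLIER;
}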
u32 qp_count) return ret_code; } - mem_size = sizeof(struct irdma_hmc_sd_entry) * - (hmc_info->sd_table.sd_cnt + hmc_info->first_sd_index + 1); - virt_mem.size = mem_size; - virt_mem.va = kzalloc(virt_mem.size, GFP_KERNEL); - if (!virt_mem.va) { - ibdev_dbg(to_ibdev(dev), - "HMC: failed to allocate memory for sd_entry buffer\n"); - return -ENOMEM; - } - hmc_info->sd_table.sd_entry = virt_mem.va; - - return ret_code; + return irdma_cfg_sd_mem(dev, hmc_info); } /** @@ -5242,6 +6396,22 @@ static int irdma_exec_cqp_cmd(struct irdma_sc_dev *dev, &pcmdinfo->in.u.mc_modify.info, pcmdinfo->in.u.mc_modify.scratch); break; + case IRDMA_OP_SRQ_CREATE: + status = irdma_sc_srq_create(pcmdinfo->in.u.srq_create.srq, + pcmdinfo->in.u.srq_create.scratch, + pcmdinfo->post_sq); + break; + case IRDMA_OP_SRQ_MODIFY: + status = irdma_sc_srq_modify(pcmdinfo->in.u.srq_modify.srq, + &pcmdinfo->in.u.srq_modify.info, + pcmdinfo->in.u.srq_modify.scratch, + pcmdinfo->post_sq); + break; + case IRDMA_OP_SRQ_DESTROY: + status = irdma_sc_srq_destroy(pcmdinfo->in.u.srq_destroy.srq, + pcmdinfo->in.u.srq_destroy.scratch, + pcmdinfo->post_sq); + break; default: status = -EOPNOTSUPP; break; @@ -5314,14 +6484,26 @@ void irdma_cfg_aeq(struct irdma_sc_dev *dev, u32 idx, bool enable) */ void sc_vsi_update_stats(struct irdma_sc_vsi *vsi) { - struct irdma_gather_stats *gather_stats; - struct irdma_gather_stats *last_gather_stats; + struct irdma_dev_hw_stats *hw_stats = &vsi->pestat->hw_stats; + struct irdma_gather_stats *gather_stats = + vsi->pestat->gather_info.gather_stats_va; + struct irdma_gather_stats *last_gather_stats = + vsi->pestat->gather_info.last_gather_stats_va; + const struct irdma_hw_stat_map *map = vsi->dev->hw_stats_map; + u16 max_stat_idx = vsi->dev->hw_attrs.max_stat_idx; + u16 i; + + if (vsi->dev->hw_attrs.uk_attrs.hw_rev >= IRDMA_GEN_3) { + for (i = 0; i < max_stat_idx; i++) { + u16 idx = map[i].byteoff / sizeof(u64); + + hw_stats->stats_val[i] = gather_stats->val[idx]; + } + return; + } - gather_stats = vsi->pestat->gather_info.gather_stats_va; - last_gather_stats = vsi->pestat->gather_info.last_gather_stats_va; - irdma_update_stats(&vsi->pestat->hw_stats, gather_stats, - last_gather_stats, vsi->dev->hw_stats_map, - vsi->dev->hw_attrs.max_stat_idx); + irdma_update_stats(hw_stats, gather_stats, last_gather_stats, + map, max_stat_idx); } /** @@ -5356,6 +6538,9 @@ static inline void irdma_sc_init_hw(struct irdma_sc_dev *dev) case IRDMA_GEN_2: icrdma_init_hw(dev); break; + case IRDMA_GEN_3: + ig3rdma_init_hw(dev); + break; } } @@ -5381,10 +6566,15 @@ int irdma_sc_dev_init(enum irdma_vers ver, struct irdma_sc_dev *dev, dev->fpm_commit_buf = info->fpm_commit_buf; dev->hw = info->hw; dev->hw->hw_addr = info->bar0; + dev->protocol_used = info->protocol_used; /* Setup the hardware limits, hmc may limit further */ dev->hw_attrs.min_hw_qp_id = IRDMA_MIN_IW_QP_ID; + dev->hw_attrs.min_hw_srq_id = IRDMA_MIN_IW_SRQ_ID; dev->hw_attrs.min_hw_aeq_size = IRDMA_MIN_AEQ_ENTRIES; - dev->hw_attrs.max_hw_aeq_size = IRDMA_MAX_AEQ_ENTRIES; + if (dev->hw_attrs.uk_attrs.hw_rev >= IRDMA_GEN_3) + dev->hw_attrs.max_hw_aeq_size = IRDMA_MAX_AEQ_ENTRIES_GEN_3; + else + dev->hw_attrs.max_hw_aeq_size = IRDMA_MAX_AEQ_ENTRIES; dev->hw_attrs.min_hw_ceq_size = IRDMA_MIN_CEQ_ENTRIES; dev->hw_attrs.max_hw_ceq_size = IRDMA_MAX_CEQ_ENTRIES; dev->hw_attrs.uk_attrs.min_hw_cq_size = IRDMA_MIN_CQ_SIZE; @@ -5409,21 +6599,39 @@ int irdma_sc_dev_init(enum irdma_vers ver, struct irdma_sc_dev *dev, dev->hw_attrs.max_sleep_count = IRDMA_SLEEP_COUNT; 
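The GEN3 branch of sc_vsi_update_stats above treats the gather buffer as a dense array of u64 counters and uses the per-generation map to translate a logical stat index into a word offset. A trimmed sketch of that translation (the map struct is reduced to the one field used here):

#include <stdint.h>
#include <stddef.h>

struct stat_map { size_t byteoff; };   /* trimmed irdma_hw_stat_map */

/* GEN3 path of sc_vsi_update_stats: the gather area is dense u64 words,
 * and the per-generation map says which word backs each logical stat.
 */
static void update_stats(uint64_t *stats_val, const uint64_t *gather,
                         const struct stat_map *map, uint16_t max_stat_idx)
{
        for (uint16_t i = 0; i < max_stat_idx; i++)
                stats_val[i] = gather[map[i].byteoff / sizeof(uint64_t)];
}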
dev->hw_attrs.max_cqp_compl_wait_time_ms = CQP_COMPL_WAIT_TIME_MS; - dev->hw_attrs.uk_attrs.hw_rev = ver; + if (!dev->privileged) { + ret_code = irdma_vchnl_req_get_hmc_fcn(dev); + if (ret_code) { + ibdev_dbg(to_ibdev(dev), + "DEV: Get HMC function ret = %d\n", + ret_code); + + return ret_code; + } + } + irdma_sc_init_hw(dev); - if (irdma_wait_pe_ready(dev)) - return -ETIMEDOUT; + if (dev->privileged) { + if (irdma_wait_pe_ready(dev)) + return -ETIMEDOUT; - val = readl(dev->hw_regs[IRDMA_GLPCI_LBARCTRL]); - db_size = (u8)FIELD_GET(IRDMA_GLPCI_LBARCTRL_PE_DB_SIZE, val); - if (db_size != IRDMA_PE_DB_SIZE_4M && db_size != IRDMA_PE_DB_SIZE_8M) { - ibdev_dbg(to_ibdev(dev), - "DEV: RDMA PE doorbell is not enabled in CSR val 0x%x db_size=%d\n", - val, db_size); - return -ENODEV; + val = readl(dev->hw_regs[IRDMA_GLPCI_LBARCTRL]); + db_size = (u8)FIELD_GET(IRDMA_GLPCI_LBARCTRL_PE_DB_SIZE, val); + if (db_size != IRDMA_PE_DB_SIZE_4M && + db_size != IRDMA_PE_DB_SIZE_8M) { + ibdev_dbg(to_ibdev(dev), + "DEV: RDMA PE doorbell is not enabled in CSR val 0x%x db_size=%d\n", + val, db_size); + return -ENODEV; + } + } else { + ret_code = irdma_vchnl_req_get_reg_layout(dev); + if (ret_code) + ibdev_dbg(to_ibdev(dev), + "DEV: Get Register layout failed ret = %d\n", + ret_code); } - dev->db_addr = dev->hw->hw_addr + (uintptr_t)dev->hw_regs[IRDMA_DB_ADDR_OFFSET]; return ret_code; } diff --git a/drivers/infiniband/hw/irdma/defs.h b/drivers/infiniband/hw/irdma/defs.h index 2cb4b96db721..983b22d7ae23 100644 --- a/drivers/infiniband/hw/irdma/defs.h +++ b/drivers/infiniband/hw/irdma/defs.h @@ -14,6 +14,18 @@ #define IRDMA_PE_DB_SIZE_4M 1 #define IRDMA_PE_DB_SIZE_8M 2 +#define IRDMA_IRD_HW_SIZE_4_GEN3 0 +#define IRDMA_IRD_HW_SIZE_8_GEN3 1 +#define IRDMA_IRD_HW_SIZE_16_GEN3 2 +#define IRDMA_IRD_HW_SIZE_32_GEN3 3 +#define IRDMA_IRD_HW_SIZE_64_GEN3 4 +#define IRDMA_IRD_HW_SIZE_128_GEN3 5 +#define IRDMA_IRD_HW_SIZE_256_GEN3 6 +#define IRDMA_IRD_HW_SIZE_512_GEN3 7 +#define IRDMA_IRD_HW_SIZE_1024_GEN3 8 +#define IRDMA_IRD_HW_SIZE_2048_GEN3 9 +#define IRDMA_IRD_HW_SIZE_4096_GEN3 10 + #define IRDMA_IRD_HW_SIZE_4 0 #define IRDMA_IRD_HW_SIZE_16 1 #define IRDMA_IRD_HW_SIZE_64 2 @@ -114,6 +126,13 @@ enum irdma_protocol_used { #define IRDMA_UPDATE_SD_BUFF_SIZE 128 #define IRDMA_FEATURE_BUF_SIZE (8 * IRDMA_MAX_FEATURES) +#define ENABLE_LOC_MEM 63 +#define IRDMA_ATOMICS_ALLOWED_BIT 1 +#define MAX_PBLE_PER_SD 0x40000 +#define MAX_PBLE_SD_PER_FCN 0x400 +#define MAX_MR_PER_SD 0x8000 +#define MAX_MR_SD_PER_FCN 0x80 +#define IRDMA_PBLE_COMMIT_OFFSET 112 #define IRDMA_MAX_QUANTA_PER_WR 8 #define IRDMA_QP_SW_MAX_WQ_QUANTA 32768 @@ -121,6 +140,10 @@ enum irdma_protocol_used { #define IRDMA_QP_SW_MAX_RQ_QUANTA 32768 #define IRDMA_MAX_QP_WRS(max_quanta_per_wr) \ ((IRDMA_QP_SW_MAX_WQ_QUANTA - IRDMA_SQ_RSVD) / (max_quanta_per_wr)) +#define IRDMA_SRQ_MIN_QUANTA 8 +#define IRDMA_SRQ_MAX_QUANTA 262144 +#define IRDMA_MAX_SRQ_WRS \ + ((IRDMA_SRQ_MAX_QUANTA - IRDMA_RQ_RSVD) / IRDMA_MAX_QUANTA_PER_WR) #define IRDMAQP_TERM_SEND_TERM_AND_FIN 0 #define IRDMAQP_TERM_SEND_TERM_ONLY 1 @@ -147,8 +170,13 @@ enum irdma_protocol_used { #define IRDMA_SQ_RSVD 258 #define IRDMA_RQ_RSVD 1 -#define IRDMA_FEATURE_RTS_AE 1ULL -#define IRDMA_FEATURE_CQ_RESIZE 2ULL +#define IRDMA_FEATURE_RTS_AE BIT_ULL(0) +#define IRDMA_FEATURE_CQ_RESIZE BIT_ULL(1) +#define IRDMA_FEATURE_64_BYTE_CQE BIT_ULL(5) +#define IRDMA_FEATURE_ATOMIC_OPS BIT_ULL(6) +#define IRDMA_FEATURE_SRQ BIT_ULL(7) +#define IRDMA_FEATURE_CQE_TIMESTAMPING BIT_ULL(8) + #define IRDMAQP_OP_RDMA_WRITE 0x00 #define 
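The IRDMA_IRD_HW_SIZE_*_GEN3 table above is a log2 encoding: depth 4 encodes to 0, 8 to 1, and so on up to 4096 at 10, i.e. encoding = log2(depth) - 2. A hypothetical helper showing the round-up behavior (the driver itself uses the #defines directly):

#include <stdint.h>

/* GEN3 encodes the IRD/ORD depth as log2(depth) - 2, matching the
 * IRDMA_IRD_HW_SIZE_*_GEN3 table (4 -> 0, 8 -> 1, ... 4096 -> 10).
 */
static uint8_t ird_encode_gen3(uint32_t ird)
{
        uint8_t enc = 0;

        while ((4U << enc) < ird)
                enc++;
        return enc;     /* rounds up to the next supported size */
}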
IRDMAQP_OP_RDMA_READ 0x01 #define IRDMAQP_OP_RDMA_SEND 0x03 @@ -161,6 +189,8 @@ enum irdma_protocol_used { #define IRDMAQP_OP_RDMA_READ_LOC_INV 0x0b #define IRDMAQP_OP_NOP 0x0c #define IRDMAQP_OP_RDMA_WRITE_SOL 0x0d +#define IRDMAQP_OP_ATOMIC_FETCH_ADD 0x0f +#define IRDMAQP_OP_ATOMIC_COMPARE_SWAP_ADD 0x11 #define IRDMAQP_OP_GEN_RTS_AE 0x30 enum irdma_cqp_op_type { @@ -212,9 +242,12 @@ enum irdma_cqp_op_type { IRDMA_OP_ADD_LOCAL_MAC_ENTRY = 46, IRDMA_OP_DELETE_LOCAL_MAC_ENTRY = 47, IRDMA_OP_CQ_MODIFY = 48, + IRDMA_OP_SRQ_CREATE = 49, + IRDMA_OP_SRQ_MODIFY = 50, + IRDMA_OP_SRQ_DESTROY = 51, /* Must be last entry*/ - IRDMA_MAX_CQP_OPS = 49, + IRDMA_MAX_CQP_OPS = 52, }; /* CQP SQ WQES */ @@ -224,6 +257,9 @@ enum irdma_cqp_op_type { #define IRDMA_CQP_OP_CREATE_CQ 0x03 #define IRDMA_CQP_OP_MODIFY_CQ 0x04 #define IRDMA_CQP_OP_DESTROY_CQ 0x05 +#define IRDMA_CQP_OP_CREATE_SRQ 0x06 +#define IRDMA_CQP_OP_MODIFY_SRQ 0x07 +#define IRDMA_CQP_OP_DESTROY_SRQ 0x08 #define IRDMA_CQP_OP_ALLOC_STAG 0x09 #define IRDMA_CQP_OP_REG_MR 0x0a #define IRDMA_CQP_OP_QUERY_STAG 0x0b @@ -265,97 +301,6 @@ enum irdma_cqp_op_type { #define IRDMA_CQP_OP_GATHER_STATS 0x2e #define IRDMA_CQP_OP_UP_MAP 0x2f -/* Async Events codes */ -#define IRDMA_AE_AMP_UNALLOCATED_STAG 0x0102 -#define IRDMA_AE_AMP_INVALID_STAG 0x0103 -#define IRDMA_AE_AMP_BAD_QP 0x0104 -#define IRDMA_AE_AMP_BAD_PD 0x0105 -#define IRDMA_AE_AMP_BAD_STAG_KEY 0x0106 -#define IRDMA_AE_AMP_BAD_STAG_INDEX 0x0107 -#define IRDMA_AE_AMP_BOUNDS_VIOLATION 0x0108 -#define IRDMA_AE_AMP_RIGHTS_VIOLATION 0x0109 -#define IRDMA_AE_AMP_TO_WRAP 0x010a -#define IRDMA_AE_AMP_FASTREG_VALID_STAG 0x010c -#define IRDMA_AE_AMP_FASTREG_MW_STAG 0x010d -#define IRDMA_AE_AMP_FASTREG_INVALID_RIGHTS 0x010e -#define IRDMA_AE_AMP_FASTREG_INVALID_LENGTH 0x0110 -#define IRDMA_AE_AMP_INVALIDATE_SHARED 0x0111 -#define IRDMA_AE_AMP_INVALIDATE_NO_REMOTE_ACCESS_RIGHTS 0x0112 -#define IRDMA_AE_AMP_INVALIDATE_MR_WITH_BOUND_WINDOWS 0x0113 -#define IRDMA_AE_AMP_MWBIND_VALID_STAG 0x0114 -#define IRDMA_AE_AMP_MWBIND_OF_MR_STAG 0x0115 -#define IRDMA_AE_AMP_MWBIND_TO_ZERO_BASED_STAG 0x0116 -#define IRDMA_AE_AMP_MWBIND_TO_MW_STAG 0x0117 -#define IRDMA_AE_AMP_MWBIND_INVALID_RIGHTS 0x0118 -#define IRDMA_AE_AMP_MWBIND_INVALID_BOUNDS 0x0119 -#define IRDMA_AE_AMP_MWBIND_TO_INVALID_PARENT 0x011a -#define IRDMA_AE_AMP_MWBIND_BIND_DISABLED 0x011b -#define IRDMA_AE_PRIV_OPERATION_DENIED 0x011c -#define IRDMA_AE_AMP_INVALIDATE_TYPE1_MW 0x011d -#define IRDMA_AE_AMP_MWBIND_ZERO_BASED_TYPE1_MW 0x011e -#define IRDMA_AE_AMP_FASTREG_INVALID_PBL_HPS_CFG 0x011f -#define IRDMA_AE_AMP_MWBIND_WRONG_TYPE 0x0120 -#define IRDMA_AE_AMP_FASTREG_PBLE_MISMATCH 0x0121 -#define IRDMA_AE_UDA_XMIT_DGRAM_TOO_LONG 0x0132 -#define IRDMA_AE_UDA_XMIT_BAD_PD 0x0133 -#define IRDMA_AE_UDA_XMIT_DGRAM_TOO_SHORT 0x0134 -#define IRDMA_AE_UDA_L4LEN_INVALID 0x0135 -#define IRDMA_AE_BAD_CLOSE 0x0201 -#define IRDMA_AE_RDMAP_ROE_BAD_LLP_CLOSE 0x0202 -#define IRDMA_AE_CQ_OPERATION_ERROR 0x0203 -#define IRDMA_AE_RDMA_READ_WHILE_ORD_ZERO 0x0205 -#define IRDMA_AE_STAG_ZERO_INVALID 0x0206 -#define IRDMA_AE_IB_RREQ_AND_Q1_FULL 0x0207 -#define IRDMA_AE_IB_INVALID_REQUEST 0x0208 -#define IRDMA_AE_WQE_UNEXPECTED_OPCODE 0x020a -#define IRDMA_AE_WQE_INVALID_PARAMETER 0x020b -#define IRDMA_AE_WQE_INVALID_FRAG_DATA 0x020c -#define IRDMA_AE_IB_REMOTE_ACCESS_ERROR 0x020d -#define IRDMA_AE_IB_REMOTE_OP_ERROR 0x020e -#define IRDMA_AE_WQE_LSMM_TOO_LONG 0x0220 -#define IRDMA_AE_INVALID_REQUEST 0x0223 -#define IRDMA_AE_DDP_INVALID_MSN_GAP_IN_MSN 0x0301 -#define 
IRDMA_AE_DDP_UBE_DDP_MESSAGE_TOO_LONG_FOR_AVAILABLE_BUFFER 0x0303 -#define IRDMA_AE_DDP_UBE_INVALID_DDP_VERSION 0x0304 -#define IRDMA_AE_DDP_UBE_INVALID_MO 0x0305 -#define IRDMA_AE_DDP_UBE_INVALID_MSN_NO_BUFFER_AVAILABLE 0x0306 -#define IRDMA_AE_DDP_UBE_INVALID_QN 0x0307 -#define IRDMA_AE_DDP_NO_L_BIT 0x0308 -#define IRDMA_AE_RDMAP_ROE_INVALID_RDMAP_VERSION 0x0311 -#define IRDMA_AE_RDMAP_ROE_UNEXPECTED_OPCODE 0x0312 -#define IRDMA_AE_ROE_INVALID_RDMA_READ_REQUEST 0x0313 -#define IRDMA_AE_ROE_INVALID_RDMA_WRITE_OR_READ_RESP 0x0314 -#define IRDMA_AE_ROCE_RSP_LENGTH_ERROR 0x0316 -#define IRDMA_AE_ROCE_EMPTY_MCG 0x0380 -#define IRDMA_AE_ROCE_BAD_MC_IP_ADDR 0x0381 -#define IRDMA_AE_ROCE_BAD_MC_QPID 0x0382 -#define IRDMA_AE_MCG_QP_PROTOCOL_MISMATCH 0x0383 -#define IRDMA_AE_INVALID_ARP_ENTRY 0x0401 -#define IRDMA_AE_INVALID_TCP_OPTION_RCVD 0x0402 -#define IRDMA_AE_STALE_ARP_ENTRY 0x0403 -#define IRDMA_AE_INVALID_AH_ENTRY 0x0406 -#define IRDMA_AE_LLP_CLOSE_COMPLETE 0x0501 -#define IRDMA_AE_LLP_CONNECTION_RESET 0x0502 -#define IRDMA_AE_LLP_FIN_RECEIVED 0x0503 -#define IRDMA_AE_LLP_RECEIVED_MARKER_AND_LENGTH_FIELDS_DONT_MATCH 0x0504 -#define IRDMA_AE_LLP_RECEIVED_MPA_CRC_ERROR 0x0505 -#define IRDMA_AE_LLP_SEGMENT_TOO_SMALL 0x0507 -#define IRDMA_AE_LLP_SYN_RECEIVED 0x0508 -#define IRDMA_AE_LLP_TERMINATE_RECEIVED 0x0509 -#define IRDMA_AE_LLP_TOO_MANY_RETRIES 0x050a -#define IRDMA_AE_LLP_TOO_MANY_KEEPALIVE_RETRIES 0x050b -#define IRDMA_AE_LLP_DOUBT_REACHABILITY 0x050c -#define IRDMA_AE_LLP_CONNECTION_ESTABLISHED 0x050e -#define IRDMA_AE_LLP_TOO_MANY_RNRS 0x050f -#define IRDMA_AE_RESOURCE_EXHAUSTION 0x0520 -#define IRDMA_AE_RESET_SENT 0x0601 -#define IRDMA_AE_TERMINATE_SENT 0x0602 -#define IRDMA_AE_RESET_NOT_SENT 0x0603 -#define IRDMA_AE_LCE_QP_CATASTROPHIC 0x0700 -#define IRDMA_AE_LCE_FUNCTION_CATASTROPHIC 0x0701 -#define IRDMA_AE_LCE_CQ_CATASTROPHIC 0x0702 -#define IRDMA_AE_QP_SUSPEND_COMPLETE 0x0900 - #define FLD_LS_64(dev, val, field) \ (((u64)(val) << (dev)->hw_shifts[field ## _S]) & (dev)->hw_masks[field ## _M]) #define FLD_RS_64(dev, val, field) \ @@ -393,9 +338,13 @@ enum irdma_cqp_op_type { #define IRDMA_CQPSQ_STATS_USE_INST BIT_ULL(61) #define IRDMA_CQPSQ_STATS_OP GENMASK_ULL(37, 32) #define IRDMA_CQPSQ_STATS_INST_INDEX GENMASK_ULL(6, 0) -#define IRDMA_CQPSQ_STATS_HMC_FCN_INDEX GENMASK_ULL(5, 0) +#define IRDMA_CQPSQ_STATS_HMC_FCN_INDEX GENMASK_ULL(15, 0) #define IRDMA_CQPSQ_WS_WQEVALID BIT_ULL(63) -#define IRDMA_CQPSQ_WS_NODEOP GENMASK_ULL(53, 52) +#define IRDMA_CQPSQ_WS_NODEOP GENMASK_ULL(55, 52) +#define IRDMA_SD_MAX GENMASK_ULL(15, 0) +#define IRDMA_MEM_MAX GENMASK_ULL(15, 0) +#define IRDMA_QP_MEM_LOC GENMASK_ULL(47, 44) +#define IRDMA_MR_MEM_LOC GENMASK_ULL(27, 24) #define IRDMA_CQPSQ_WS_ENABLENODE BIT_ULL(62) #define IRDMA_CQPSQ_WS_NODETYPE BIT_ULL(61) @@ -404,16 +353,16 @@ enum irdma_cqp_op_type { #define IRDMA_CQPSQ_WS_VMVFTYPE GENMASK_ULL(55, 54) #define IRDMA_CQPSQ_WS_VMVFNUM GENMASK_ULL(51, 42) #define IRDMA_CQPSQ_WS_OP GENMASK_ULL(37, 32) -#define IRDMA_CQPSQ_WS_PARENTID GENMASK_ULL(25, 16) -#define IRDMA_CQPSQ_WS_NODEID GENMASK_ULL(9, 0) -#define IRDMA_CQPSQ_WS_VSI GENMASK_ULL(57, 48) +#define IRDMA_CQPSQ_WS_PARENTID GENMASK_ULL(29, 16) +#define IRDMA_CQPSQ_WS_NODEID GENMASK_ULL(13, 0) +#define IRDMA_CQPSQ_WS_VSI GENMASK_ULL(63, 48) #define IRDMA_CQPSQ_WS_WEIGHT GENMASK_ULL(38, 32) #define IRDMA_CQPSQ_UP_WQEVALID BIT_ULL(63) #define IRDMA_CQPSQ_UP_USEVLAN BIT_ULL(62) #define IRDMA_CQPSQ_UP_USEOVERRIDE BIT_ULL(61) #define IRDMA_CQPSQ_UP_OP GENMASK_ULL(37, 32) -#define 
IRDMA_CQPSQ_UP_HMCFCNIDX GENMASK_ULL(5, 0) +#define IRDMA_CQPSQ_UP_HMCFCNIDX GENMASK_ULL(15, 0) #define IRDMA_CQPSQ_UP_CNPOVERRIDE GENMASK_ULL(37, 32) #define IRDMA_CQPSQ_QUERY_RDMA_FEATURES_WQEVALID BIT_ULL(63) #define IRDMA_CQPSQ_QUERY_RDMA_FEATURES_BUF_LEN GENMASK_ULL(31, 0) @@ -448,6 +397,16 @@ enum irdma_cqp_op_type { #define IRDMA_CQPHC_SVER GENMASK_ULL(31, 24) #define IRDMA_CQPHC_SQBASE GENMASK_ULL(63, 9) +#define IRDMA_CQPHC_TIMESTAMP_OVERRIDE BIT_ULL(5) +#define IRDMA_CQPHC_TS_SHIFT GENMASK_ULL(12, 8) +#define IRDMA_CQPHC_EN_FINE_GRAINED_TIMERS BIT_ULL(0) + +#define IRDMA_CQPHC_OOISC_BLKSIZE GENMASK_ULL(63, 60) +#define IRDMA_CQPHC_RRSP_BLKSIZE GENMASK_ULL(59, 56) +#define IRDMA_CQPHC_Q1_BLKSIZE GENMASK_ULL(55, 52) +#define IRDMA_CQPHC_XMIT_BLKSIZE GENMASK_ULL(51, 48) +#define IRDMA_CQPHC_BLKSIZES_VALID BIT_ULL(4) + #define IRDMA_CQPHC_QPCTX GENMASK_ULL(63, 0) #define IRDMA_QP_DBSA_HW_SQ_TAIL GENMASK_ULL(14, 0) #define IRDMA_CQ_DBSA_CQEIDX GENMASK_ULL(19, 0) @@ -461,6 +420,8 @@ enum irdma_cqp_op_type { #define IRDMA_CCQ_OPRETVAL GENMASK_ULL(31, 0) +#define IRDMA_CCQ_DEFINFO GENMASK_ULL(63, 32) + #define IRDMA_CQ_MINERR GENMASK_ULL(15, 0) #define IRDMA_CQ_MAJERR GENMASK_ULL(31, 16) #define IRDMA_CQ_WQEIDX GENMASK_ULL(46, 32) @@ -469,6 +430,7 @@ enum irdma_cqp_op_type { #define IRDMA_CQ_ERROR BIT_ULL(55) #define IRDMA_CQ_SQ BIT_ULL(62) +#define IRDMA_CQ_SRQ BIT_ULL(52) #define IRDMA_CQ_VALID BIT_ULL(63) #define IRDMA_CQ_IMMVALID BIT_ULL(62) #define IRDMA_CQ_UDSMACVALID BIT_ULL(61) @@ -476,8 +438,6 @@ enum irdma_cqp_op_type { #define IRDMA_CQ_UDSMAC GENMASK_ULL(47, 0) #define IRDMA_CQ_UDVLAN GENMASK_ULL(63, 48) -#define IRDMA_CQ_IMMDATA_S 0 -#define IRDMA_CQ_IMMDATA_M (0xffffffffffffffffULL << IRDMA_CQ_IMMVALID_S) #define IRDMA_CQ_IMMDATALOW32 GENMASK_ULL(31, 0) #define IRDMA_CQ_IMMDATAUP32 GENMASK_ULL(63, 32) #define IRDMACQ_PAYLDLEN GENMASK_ULL(31, 0) @@ -508,6 +468,17 @@ enum irdma_cqp_op_type { #define IRDMA_AEQE_Q2DATA GENMASK_ULL(62, 61) #define IRDMA_AEQE_VALID BIT_ULL(63) +#define IRDMA_AEQE_Q2DATA_GEN_3 GENMASK_ULL(5, 4) +#define IRDMA_AEQE_TCPSTATE_GEN_3 GENMASK_ULL(3, 0) +#define IRDMA_AEQE_QPCQID_GEN_3 GENMASK_ULL(24, 0) +#define IRDMA_AEQE_AECODE_GEN_3 GENMASK_ULL(61, 50) +#define IRDMA_AEQE_OVERFLOW_GEN_3 BIT_ULL(62) +#define IRDMA_AEQE_WQDESCIDX_GEN_3 GENMASK_ULL(49, 32) +#define IRDMA_AEQE_IWSTATE_GEN_3 GENMASK_ULL(31, 29) +#define IRDMA_AEQE_AESRC_GEN_3 GENMASK_ULL(28, 25) +#define IRDMA_AEQE_CMPL_CTXT_S 6 +#define IRDMA_AEQE_CMPL_CTXT GENMASK_ULL(63, 6) + #define IRDMA_UDA_QPSQ_NEXT_HDR GENMASK_ULL(23, 16) #define IRDMA_UDA_QPSQ_OPCODE GENMASK_ULL(37, 32) #define IRDMA_UDA_QPSQ_L4LEN GENMASK_ULL(45, 42) @@ -530,11 +501,14 @@ enum irdma_cqp_op_type { #define IRDMA_CQPSQ_WQEVALID BIT_ULL(63) #define IRDMA_CQPSQ_TPHVAL GENMASK_ULL(7, 0) -#define IRDMA_CQPSQ_VSIIDX GENMASK_ULL(17, 8) +#define IRDMA_CQPSQ_VSIIDX GENMASK_ULL(23, 8) #define IRDMA_CQPSQ_TPHEN BIT_ULL(60) #define IRDMA_CQPSQ_PBUFADDR IRDMA_CQPHC_QPCTX +#define IRDMA_CQPSQ_PASID GENMASK_ULL(51, 32) +#define IRDMA_CQPSQ_PASID_VALID BIT_ULL(62) + /* Create/Modify/Destroy QP */ #define IRDMA_CQPSQ_QP_NEWMSS GENMASK_ULL(45, 32) @@ -566,10 +540,30 @@ enum irdma_cqp_op_type { #define IRDMA_CQPSQ_QP_DBSHADOWADDR IRDMA_CQPHC_QPCTX +#define IRDMA_CQPSQ_SRQ_RQSIZE GENMASK_ULL(3, 0) +#define IRDMA_CQPSQ_SRQ_RQ_WQE_SIZE GENMASK_ULL(5, 4) +#define IRDMA_CQPSQ_SRQ_SRQ_LIMIT GENMASK_ULL(43, 32) +#define IRDMA_CQPSQ_SRQ_SRQCTX GENMASK_ULL(63, 6) +#define IRDMA_CQPSQ_SRQ_PD_ID GENMASK_ULL(39, 16) +#define 
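The GEN3 AEQE masks above repartition the first 64-bit word: QP/CQ id in [24:0], source in [28:25], iWARP state in [31:29], WQE index in [49:32], AE code in [61:50], and overflow in bit 62, while the second word keeps a 64-byte-aligned context pointer in IRDMA_AEQE_CMPL_CTXT [63:6]. A standalone decode of the fields irdma_sc_get_next_aeqe reads on GEN3, with field_get as a stand-in for FIELD_GET:

#include <stdint.h>

#define GENMASK64(h, l) ((~0ULL >> (63 - (h))) & (~0ULL << (l)))
#define AE_QPCQID   GENMASK64(24, 0)   /* IRDMA_AEQE_QPCQID_GEN_3 */
#define AE_WQEIDX   GENMASK64(49, 32)  /* IRDMA_AEQE_WQDESCIDX_GEN_3 */
#define AE_AECODE   GENMASK64(61, 50)  /* IRDMA_AEQE_AECODE_GEN_3 */
#define AE_OVERFLOW (1ULL << 62)       /* IRDMA_AEQE_OVERFLOW_GEN_3 */

static uint64_t field_get(uint64_t mask, uint64_t word)
{
        return (word & mask) >> __builtin_ctzll(mask);
}

/* decode word 1 of a GEN3 AEQE as irdma_sc_get_next_aeqe does */
static void decode_aeqe_gen3(uint64_t temp, uint32_t *qp_cq_id,
                             uint16_t *ae_id, uint16_t *wqe_idx,
                             int *overflow)
{
        *qp_cq_id = (uint32_t)field_get(AE_QPCQID, temp);
        *ae_id = (uint16_t)field_get(AE_AECODE, temp);
        *wqe_idx = (uint16_t)field_get(AE_WQEIDX, temp);
        *overflow = !!(temp & AE_OVERFLOW);
}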
IRDMA_CQPSQ_SRQ_SRQ_ID GENMASK_ULL(15, 0) +#define IRDMA_CQPSQ_SRQ_OP GENMASK_ULL(37, 32) +#define IRDMA_CQPSQ_SRQ_LEAF_PBL_SIZE GENMASK_ULL(45, 44) +#define IRDMA_CQPSQ_SRQ_VIRTMAP BIT_ULL(47) +#define IRDMA_CQPSQ_SRQ_TPH_EN BIT_ULL(60) +#define IRDMA_CQPSQ_SRQ_ARM_LIMIT_EVENT BIT_ULL(61) +#define IRDMA_CQPSQ_SRQ_FIRST_PM_PBL_IDX GENMASK_ULL(27, 0) +#define IRDMA_CQPSQ_SRQ_TPH_VALUE GENMASK_ULL(7, 0) +#define IRDMA_CQPSQ_SRQ_PHYSICAL_BUFFER_ADDR_S 8 +#define IRDMA_CQPSQ_SRQ_PHYSICAL_BUFFER_ADDR GENMASK_ULL(63, 8) +#define IRDMA_CQPSQ_SRQ_DB_SHADOW_ADDR_S 6 +#define IRDMA_CQPSQ_SRQ_DB_SHADOW_ADDR GENMASK_ULL(63, 6) + #define IRDMA_CQPSQ_CQ_CQSIZE GENMASK_ULL(20, 0) #define IRDMA_CQPSQ_CQ_CQCTX GENMASK_ULL(62, 0) #define IRDMA_CQPSQ_CQ_SHADOW_READ_THRESHOLD GENMASK(17, 0) +#define IRDMA_CQPSQ_CQ_CQID_HIGH GENMASK_ULL(52, 50) +#define IRDMA_CQPSQ_CQ_CEQID_HIGH GENMASK_ULL(59, 54) #define IRDMA_CQPSQ_CQ_OP GENMASK_ULL(37, 32) #define IRDMA_CQPSQ_CQ_CQRESIZE BIT_ULL(43) #define IRDMA_CQPSQ_CQ_LPBLSIZE GENMASK_ULL(45, 44) @@ -590,6 +584,7 @@ enum irdma_cqp_op_type { #define IRDMA_CQPSQ_STAG_MR BIT_ULL(43) #define IRDMA_CQPSQ_STAG_MWTYPE BIT_ULL(42) #define IRDMA_CQPSQ_STAG_MW1_BIND_DONT_VLDT_KEY BIT_ULL(58) +#define IRDMA_CQPSQ_STAG_PDID_HI GENMASK_ULL(59, 54) #define IRDMA_CQPSQ_STAG_LPBLSIZE IRDMA_CQPSQ_CQ_LPBLSIZE #define IRDMA_CQPSQ_STAG_HPAGESIZE GENMASK_ULL(47, 46) @@ -600,7 +595,8 @@ enum irdma_cqp_op_type { #define IRDMA_CQPSQ_STAG_USEPFRID BIT_ULL(61) #define IRDMA_CQPSQ_STAG_PBA IRDMA_CQPHC_QPCTX -#define IRDMA_CQPSQ_STAG_HMCFNIDX GENMASK_ULL(5, 0) +#define IRDMA_CQPSQ_STAG_HMCFNIDX GENMASK_ULL(15, 0) +#define IRDMA_CQPSQ_STAG_REMOTE_ATOMIC_EN BIT_ULL(61) #define IRDMA_CQPSQ_STAG_FIRSTPMPBLIDX GENMASK_ULL(27, 0) #define IRDMA_CQPSQ_QUERYSTAG_IDX IRDMA_CQPSQ_STAG_IDX @@ -628,11 +624,8 @@ enum irdma_cqp_op_type { /* Manage Push Page - MPP */ #define IRDMA_INVALID_PUSH_PAGE_INDEX_GEN_1 0xffff #define IRDMA_INVALID_PUSH_PAGE_INDEX 0xffffffff - -#define IRDMA_CQPSQ_MPP_QS_HANDLE GENMASK_ULL(9, 0) -#define IRDMA_CQPSQ_MPP_PPIDX GENMASK_ULL(9, 0) +#define IRDMA_CQPSQ_MPP_PPIDX GENMASK_ULL(31, 0) #define IRDMA_CQPSQ_MPP_PPTYPE GENMASK_ULL(61, 60) - #define IRDMA_CQPSQ_MPP_FREE_PAGE BIT_ULL(62) /* Upload Context - UCTX */ @@ -651,6 +644,8 @@ enum irdma_cqp_op_type { #define IRDMA_CQPSQ_CEQ_CEQSIZE GENMASK_ULL(21, 0) #define IRDMA_CQPSQ_CEQ_CEQID GENMASK_ULL(9, 0) +#define IRDMA_CQPSQ_CEQ_CEQID_HIGH GENMASK_ULL(15, 10) + #define IRDMA_CQPSQ_CEQ_LPBLSIZE IRDMA_CQPSQ_CQ_LPBLSIZE #define IRDMA_CQPSQ_CEQ_VMAP BIT_ULL(47) #define IRDMA_CQPSQ_CEQ_ITRNOEXPIRE BIT_ULL(46) @@ -660,10 +655,10 @@ enum irdma_cqp_op_type { #define IRDMA_CQPSQ_AEQ_VMAP BIT_ULL(47) #define IRDMA_CQPSQ_AEQ_FIRSTPMPBLIDX GENMASK_ULL(27, 0) -#define IRDMA_COMMIT_FPM_QPCNT GENMASK_ULL(18, 0) - +#define IRDMA_COMMIT_FPM_QPCNT GENMASK_ULL(20, 0) #define IRDMA_COMMIT_FPM_BASE_S 32 -#define IRDMA_CQPSQ_CFPM_HMCFNID GENMASK_ULL(5, 0) +#define IRDMA_CQPSQ_CFPM_HMCFNID GENMASK_ULL(15, 0) + #define IRDMA_CQPSQ_FWQE_AECODE GENMASK_ULL(15, 0) #define IRDMA_CQPSQ_FWQE_AESOURCE GENMASK_ULL(19, 16) #define IRDMA_CQPSQ_FWQE_RQMNERR GENMASK_ULL(15, 0) @@ -675,6 +670,10 @@ enum irdma_cqp_op_type { #define IRDMA_CQPSQ_FWQE_USERFLCODE BIT_ULL(60) #define IRDMA_CQPSQ_FWQE_FLUSHSQ BIT_ULL(61) #define IRDMA_CQPSQ_FWQE_FLUSHRQ BIT_ULL(62) +#define IRDMA_CQPSQ_FWQE_ERR_SQ_IDX_VALID BIT_ULL(42) +#define IRDMA_CQPSQ_FWQE_ERR_SQ_IDX GENMASK_ULL(49, 32) +#define IRDMA_CQPSQ_FWQE_ERR_RQ_IDX_VALID BIT_ULL(43) +#define IRDMA_CQPSQ_FWQE_ERR_RQ_IDX 
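Fields like IRDMA_CQPSQ_SRQ_DB_SHADOW_ADDR above carry only bits [63:6] of a DMA address (note the matching _S 6 shift), which works because the shadow area is 64-byte aligned. A small sketch of the alignment check this implies; whether the address is stored in place or shifted down by 6 is the driver's packing choice, so treat this purely as an alignment illustration:

#include <stdint.h>
#include <errno.h>

#define GENMASK64(h, l) ((~0ULL >> (63 - (h))) & (~0ULL << (l)))
#define SRQ_DB_SHADOW_ADDR GENMASK64(63, 6) /* IRDMA_CQPSQ_SRQ_DB_SHADOW_ADDR */

/* A [63:6] field can only represent 64-byte-aligned addresses; anything
 * in the low 6 bits would be silently lost, so reject it up front.
 */
static int pack_shadow_addr(uint64_t pa, uint64_t *out)
{
        if (pa & ~SRQ_DB_SHADOW_ADDR)
                return -EINVAL; /* not 64B aligned */
        *out = pa & SRQ_DB_SHADOW_ADDR;
        return 0;
}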
GENMASK_ULL(46, 32) #define IRDMA_CQPSQ_MAPT_PORT GENMASK_ULL(15, 0) #define IRDMA_CQPSQ_MAPT_ADDPORT BIT_ULL(62) #define IRDMA_CQPSQ_UPESD_SDCMD GENMASK_ULL(31, 0) @@ -693,9 +692,12 @@ enum irdma_cqp_op_type { #define IRDMA_CQPSQ_SUSPENDQP_QPID GENMASK_ULL(23, 0) #define IRDMA_CQPSQ_RESUMEQP_QSHANDLE GENMASK_ULL(31, 0) #define IRDMA_CQPSQ_RESUMEQP_QPID GENMASK(23, 0) +#define IRDMA_MANAGE_RSRC_VER2 BIT_ULL(2) #define IRDMA_CQPSQ_MIN_STAG_INVALID 0x0001 #define IRDMA_CQPSQ_MIN_SUSPEND_PND 0x0005 +#define IRDMA_CQPSQ_MIN_DEF_CMPL 0x0006 +#define IRDMA_CQPSQ_MIN_OOO_CMPL 0x0007 #define IRDMA_CQPSQ_MAJ_NO_ERROR 0x0000 #define IRDMA_CQPSQ_MAJ_OBJCACHE_ERROR 0xF000 @@ -712,6 +714,11 @@ enum irdma_cqp_op_type { #define IRDMAQPC_INSERTL2TAG2 BIT_ULL(11) #define IRDMAQPC_LIMIT GENMASK_ULL(13, 12) +#define IRDMAQPC_USE_SRQ BIT_ULL(10) +#define IRDMAQPC_SRQ_ID GENMASK_ULL(15, 0) +#define IRDMAQPC_PASID GENMASK_ULL(19, 0) +#define IRDMAQPC_PASID_VALID BIT_ULL(11) + #define IRDMAQPC_ECN_EN BIT_ULL(14) #define IRDMAQPC_DROPOOOSEG BIT_ULL(15) #define IRDMAQPC_DUPACK_THRESH GENMASK_ULL(18, 16) @@ -782,21 +789,31 @@ enum irdma_cqp_op_type { #define IRDMAQPC_CWNDROCE GENMASK_ULL(55, 32) #define IRDMAQPC_SNDWL1 GENMASK_ULL(31, 0) #define IRDMAQPC_SNDWL2 GENMASK_ULL(63, 32) -#define IRDMAQPC_ERR_RQ_IDX GENMASK_ULL(45, 32) +#define IRDMAQPC_MINRNR_TIMER GENMASK_ULL(4, 0) +#define IRDMAQPC_ERR_RQ_IDX GENMASK_ULL(46, 32) #define IRDMAQPC_RTOMIN GENMASK_ULL(63, 57) #define IRDMAQPC_MAXSNDWND GENMASK_ULL(31, 0) #define IRDMAQPC_REXMIT_THRESH GENMASK_ULL(53, 48) #define IRDMAQPC_RNRNAK_THRESH GENMASK_ULL(56, 54) -#define IRDMAQPC_TXCQNUM GENMASK_ULL(18, 0) -#define IRDMAQPC_RXCQNUM GENMASK_ULL(50, 32) +#define IRDMAQPC_TXCQNUM GENMASK_ULL(24, 0) +#define IRDMAQPC_RXCQNUM GENMASK_ULL(56, 32) #define IRDMAQPC_STAT_INDEX GENMASK_ULL(6, 0) #define IRDMAQPC_Q2ADDR GENMASK_ULL(63, 8) #define IRDMAQPC_LASTBYTESENT GENMASK_ULL(7, 0) #define IRDMAQPC_MACADDRESS GENMASK_ULL(63, 16) #define IRDMAQPC_ORDSIZE GENMASK_ULL(7, 0) +#define IRDMAQPC_LOCALACKTIMEOUT GENMASK_ULL(12, 8) +#define IRDMAQPC_RNRNAK_TMR GENMASK_ULL(4, 0) +#define IRDMAQPC_ORDSIZE_GEN3 GENMASK_ULL(10, 0) +#define IRDMAQPC_REMOTE_ATOMIC_EN BIT_ULL(18) +#define IRDMAQPC_STAT_INDEX_GEN3 GENMASK_ULL(47, 32) +#define IRDMAQPC_PKT_LIMIT GENMASK_ULL(55, 48) + #define IRDMAQPC_IRDSIZE GENMASK_ULL(18, 16) +#define IRDMAQPC_IRDSIZE_GEN3 GENMASK_ULL(17, 14) + #define IRDMAQPC_UDPRIVCQENABLE BIT_ULL(19) #define IRDMAQPC_WRRDRSPOK BIT_ULL(20) #define IRDMAQPC_RDOK BIT_ULL(21) @@ -833,6 +850,7 @@ enum irdma_cqp_op_type { #define IRDMA_FEATURE_INFO GENMASK_ULL(47, 0) #define IRDMA_FEATURE_CNT GENMASK_ULL(47, 32) #define IRDMA_FEATURE_TYPE GENMASK_ULL(63, 48) +#define IRDMA_FEATURE_RSRC_MAX GENMASK_ULL(31, 0) #define IRDMAQPSQ_OPCODE GENMASK_ULL(37, 32) #define IRDMAQPSQ_COPY_HOST_PBL BIT_ULL(43) @@ -856,7 +874,7 @@ enum irdma_cqp_op_type { #define IRDMAQPSQ_REMSTAGINV GENMASK_ULL(31, 0) #define IRDMAQPSQ_DESTQKEY GENMASK_ULL(31, 0) #define IRDMAQPSQ_DESTQPN GENMASK_ULL(55, 32) -#define IRDMAQPSQ_AHID GENMASK_ULL(16, 0) +#define IRDMAQPSQ_AHID GENMASK_ULL(24, 0) #define IRDMAQPSQ_INLINEDATAFLAG BIT_ULL(57) #define IRDMA_INLINE_VALID_S 7 @@ -869,6 +887,9 @@ enum irdma_cqp_op_type { #define IRDMAQPSQ_REMTO IRDMA_CQPHC_QPCTX +#define IRDMAQPSQ_STAG GENMASK_ULL(31, 0) +#define IRDMAQPSQ_REMOTE_STAG GENMASK_ULL(31, 0) + #define IRDMAQPSQ_STAGRIGHTS GENMASK_ULL(52, 48) #define IRDMAQPSQ_VABASEDTO BIT_ULL(53) #define IRDMAQPSQ_MEMWINDOWTYPE BIT_ULL(54) @@ -879,6 +900,8 @@ enum 
irdma_cqp_op_type { #define IRDMAQPSQ_BASEVA_TO_FBO IRDMA_CQPHC_QPCTX +#define IRDMAQPSQ_REMOTE_ATOMICS_EN BIT_ULL(55) + #define IRDMAQPSQ_LOCSTAG GENMASK_ULL(31, 0) #define IRDMAQPSQ_STAGKEY GENMASK_ULL(7, 0) @@ -903,11 +926,14 @@ enum irdma_cqp_op_type { #define IRDMAPFINT_OICR_PE_PUSH_M BIT(27) #define IRDMAPFINT_OICR_PE_CRITERR_M BIT(28) -#define IRDMA_QUERY_FPM_MAX_QPS GENMASK_ULL(18, 0) -#define IRDMA_QUERY_FPM_MAX_CQS GENMASK_ULL(19, 0) +#define IRDMA_QUERY_FPM_LOC_MEM_PAGES GENMASK_ULL(63, 32) +#define IRDMA_QUERY_FPM_MAX_QPS GENMASK_ULL(31, 0) +#define IRDMA_QUERY_FPM_MAX_CQS GENMASK_ULL(31, 0) #define IRDMA_QUERY_FPM_FIRST_PE_SD_INDEX GENMASK_ULL(13, 0) -#define IRDMA_QUERY_FPM_MAX_PE_SDS GENMASK_ULL(45, 32) +#define IRDMA_QUERY_FPM_MAX_PE_SDS GENMASK_ULL(44, 32) +#define IRDMA_QUERY_FPM_MAX_PE_SDS_GEN3 GENMASK_ULL(47, 32) #define IRDMA_QUERY_FPM_MAX_CEQS GENMASK_ULL(9, 0) +#define IRDMA_QUERY_FPM_MAX_IRD GENMASK_ULL(53, 50) #define IRDMA_QUERY_FPM_XFBLOCKSIZE GENMASK_ULL(63, 32) #define IRDMA_QUERY_FPM_Q1BLOCKSIZE GENMASK_ULL(63, 32) #define IRDMA_QUERY_FPM_HTMULTIPLIER GENMASK_ULL(19, 16) @@ -1103,7 +1129,7 @@ enum irdma_alignment { IRDMA_CEQ_ALIGNMENT = 0x100, IRDMA_CQ0_ALIGNMENT = 0x100, IRDMA_SD_BUF_ALIGNMENT = 0x80, - IRDMA_FEATURE_BUF_ALIGNMENT = 0x8, + IRDMA_FEATURE_BUF_ALIGNMENT = 0x10, }; enum icrdma_protocol_used { diff --git a/drivers/infiniband/hw/irdma/hmc.c b/drivers/infiniband/hw/irdma/hmc.c index ac58088a8e41..da18add141da 100644 --- a/drivers/infiniband/hw/irdma/hmc.c +++ b/drivers/infiniband/hw/irdma/hmc.c @@ -5,6 +5,7 @@ #include "defs.h" #include "type.h" #include "protos.h" +#include "virtchnl.h" /** * irdma_find_sd_index_limit - finds segment descriptor index limit @@ -228,6 +229,10 @@ int irdma_sc_create_hmc_obj(struct irdma_sc_dev *dev, bool pd_error = false; int ret_code = 0; + if (dev->hw_attrs.uk_attrs.hw_rev >= IRDMA_GEN_3 && + dev->hmc_info->hmc_obj[info->rsrc_type].mem_loc == IRDMA_LOC_MEM) + return 0; + if (info->start_idx >= info->hmc_info->hmc_obj[info->rsrc_type].cnt) return -EINVAL; @@ -330,7 +335,7 @@ static int irdma_finish_del_sd_reg(struct irdma_sc_dev *dev, u32 i, sd_idx; struct irdma_dma_mem *mem; - if (!reset) + if (dev->privileged && !reset) ret_code = irdma_hmc_sd_grp(dev, info->hmc_info, info->hmc_info->sd_indexes[0], info->del_sd_cnt, false); @@ -376,6 +381,9 @@ int irdma_sc_del_hmc_obj(struct irdma_sc_dev *dev, u32 i, j; int ret_code = 0; + if (dev->hmc_info->hmc_obj[info->rsrc_type].mem_loc == IRDMA_LOC_MEM) + return 0; + if (info->start_idx >= info->hmc_info->hmc_obj[info->rsrc_type].cnt) { ibdev_dbg(to_ibdev(dev), "HMC: error start_idx[%04d] >= [type %04d].cnt[%04d]\n", @@ -589,7 +597,10 @@ int irdma_add_pd_table_entry(struct irdma_sc_dev *dev, pd_entry->sd_index = sd_idx; pd_entry->valid = true; pd_table->use_cnt++; - irdma_invalidate_pf_hmc_pd(dev, sd_idx, rel_pd_idx); + + if (hmc_info->hmc_fn_id < dev->hw_attrs.first_hw_vf_fpm_id && + dev->privileged) + irdma_invalidate_pf_hmc_pd(dev, sd_idx, rel_pd_idx); } pd_entry->bp.use_cnt++; @@ -640,7 +651,8 @@ int irdma_remove_pd_bp(struct irdma_sc_dev *dev, pd_addr = pd_table->pd_page_addr.va; pd_addr += rel_pd_idx; memset(pd_addr, 0, sizeof(u64)); - irdma_invalidate_pf_hmc_pd(dev, sd_idx, idx); + if (dev->privileged && dev->hmc_fn_id == hmc_info->hmc_fn_id) + irdma_invalidate_pf_hmc_pd(dev, sd_idx, idx); if (!pd_entry->rsrc_pg) { mem = &pd_entry->bp.addr; diff --git a/drivers/infiniband/hw/irdma/hmc.h b/drivers/infiniband/hw/irdma/hmc.h index 415f9e23bbf6..257a5d22aa96 100644 --- 
a/drivers/infiniband/hw/irdma/hmc.h +++ b/drivers/infiniband/hw/irdma/hmc.h @@ -16,11 +16,21 @@ #define IRDMA_HMC_PD_BP_BUF_ALIGNMENT 4096 #define IRDMA_FIRST_VF_FPM_ID 8 #define FPM_MULTIPLIER 1024 +#define IRDMA_OBJ_LOC_MEM_BIT 0x4 +#define IRDMA_XF_MULTIPLIER 16 +#define IRDMA_RRF_MULTIPLIER 8 +#define IRDMA_MIN_PBLE_PAGES 3 +#define IRDMA_HMC_PAGE_SIZE 2097152 +#define IRDMA_MIN_MR_PER_QP 4 +#define IRDMA_MIN_QP_CNT 64 +#define IRDMA_FSIAV_CNT_MAX 1048576 +#define IRDMA_MIN_IRD 8 +#define IRDMA_HMC_MIN_RRF 16 enum irdma_hmc_rsrc_type { IRDMA_HMC_IW_QP = 0, IRDMA_HMC_IW_CQ = 1, - IRDMA_HMC_IW_RESERVED = 2, + IRDMA_HMC_IW_SRQ = 2, IRDMA_HMC_IW_HTE = 3, IRDMA_HMC_IW_ARP = 4, IRDMA_HMC_IW_APBVT_ENTRY = 5, @@ -48,11 +58,17 @@ enum irdma_sd_entry_type { IRDMA_SD_TYPE_DIRECT = 2, }; +enum irdma_hmc_obj_mem { + IRDMA_HOST_MEM = 0, + IRDMA_LOC_MEM = 1, +}; + struct irdma_hmc_obj_info { u64 base; u32 max_cnt; u32 cnt; u64 size; + enum irdma_hmc_obj_mem mem_loc; }; struct irdma_hmc_bp { @@ -117,6 +133,7 @@ struct irdma_update_sds_info { struct irdma_ccq_cqe_info; struct irdma_hmc_fcn_info { u32 vf_id; + u8 protocol_used; u8 free_fcn; }; diff --git a/drivers/infiniband/hw/irdma/hw.c b/drivers/infiniband/hw/irdma/hw.c index 69ce1862eabe..7bad0e38786a 100644 --- a/drivers/infiniband/hw/irdma/hw.c +++ b/drivers/infiniband/hw/irdma/hw.c @@ -33,6 +33,7 @@ static struct irdma_rsrc_limits rsrc_limits_table[] = { static enum irdma_hmc_rsrc_type iw_hmc_obj_types[] = { IRDMA_HMC_IW_QP, IRDMA_HMC_IW_CQ, + IRDMA_HMC_IW_SRQ, IRDMA_HMC_IW_HTE, IRDMA_HMC_IW_ARP, IRDMA_HMC_IW_APBVT_ENTRY, @@ -134,75 +135,68 @@ static void irdma_process_ceq(struct irdma_pci_f *rf, struct irdma_ceq *ceq) static void irdma_set_flush_fields(struct irdma_sc_qp *qp, struct irdma_aeqe_info *info) { + struct qp_err_code qp_err; + qp->sq_flush_code = info->sq; qp->rq_flush_code = info->rq; - qp->event_type = IRDMA_QP_EVENT_CATASTROPHIC; - - switch (info->ae_id) { - case IRDMA_AE_AMP_BOUNDS_VIOLATION: - case IRDMA_AE_AMP_INVALID_STAG: - case IRDMA_AE_AMP_RIGHTS_VIOLATION: - case IRDMA_AE_AMP_UNALLOCATED_STAG: - case IRDMA_AE_AMP_BAD_PD: - case IRDMA_AE_AMP_BAD_QP: - case IRDMA_AE_AMP_BAD_STAG_KEY: - case IRDMA_AE_AMP_BAD_STAG_INDEX: - case IRDMA_AE_AMP_TO_WRAP: - case IRDMA_AE_PRIV_OPERATION_DENIED: - qp->flush_code = FLUSH_PROT_ERR; - qp->event_type = IRDMA_QP_EVENT_ACCESS_ERR; - break; - case IRDMA_AE_UDA_XMIT_BAD_PD: - case IRDMA_AE_WQE_UNEXPECTED_OPCODE: - qp->flush_code = FLUSH_LOC_QP_OP_ERR; - qp->event_type = IRDMA_QP_EVENT_CATASTROPHIC; - break; - case IRDMA_AE_UDA_XMIT_DGRAM_TOO_LONG: - case IRDMA_AE_UDA_XMIT_DGRAM_TOO_SHORT: - case IRDMA_AE_UDA_L4LEN_INVALID: - case IRDMA_AE_DDP_UBE_INVALID_MO: - case IRDMA_AE_DDP_UBE_DDP_MESSAGE_TOO_LONG_FOR_AVAILABLE_BUFFER: - qp->flush_code = FLUSH_LOC_LEN_ERR; - qp->event_type = IRDMA_QP_EVENT_CATASTROPHIC; - break; - case IRDMA_AE_AMP_INVALIDATE_NO_REMOTE_ACCESS_RIGHTS: - case IRDMA_AE_IB_REMOTE_ACCESS_ERROR: - qp->flush_code = FLUSH_REM_ACCESS_ERR; - qp->event_type = IRDMA_QP_EVENT_ACCESS_ERR; - break; - case IRDMA_AE_LLP_SEGMENT_TOO_SMALL: - case IRDMA_AE_LLP_RECEIVED_MPA_CRC_ERROR: - case IRDMA_AE_ROCE_RSP_LENGTH_ERROR: - case IRDMA_AE_IB_REMOTE_OP_ERROR: - qp->flush_code = FLUSH_REM_OP_ERR; - qp->event_type = IRDMA_QP_EVENT_CATASTROPHIC; - break; - case IRDMA_AE_LCE_QP_CATASTROPHIC: - qp->flush_code = FLUSH_FATAL_ERR; - qp->event_type = IRDMA_QP_EVENT_CATASTROPHIC; - break; - case IRDMA_AE_IB_RREQ_AND_Q1_FULL: - qp->flush_code = FLUSH_GENERAL_ERR; - break; - case 
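The per-AE switch being removed around this point is replaced by a single irdma_ae_to_qp_err_code() lookup (visible just below) that returns a {flush_code, event_type} pair; its definition lives elsewhere in the series. A sketch of that table-driven shape, using two AE codes and their old mappings from the removed switch:

#include <stdint.h>

enum flush_code { FLUSH_GENERAL_ERR, FLUSH_PROT_ERR, FLUSH_RETRY_EXC_ERR };
enum qp_event { QP_EVENT_CATASTROPHIC, QP_EVENT_ACCESS_ERR };

struct qp_err { enum flush_code flush; enum qp_event event; };

/* table-driven replacement for the removed switch: one lookup shared by
 * every caller that needs an AE id translated (subset shown)
 */
static struct qp_err ae_to_qp_err(uint16_t ae_id)
{
        switch (ae_id) {
        case 0x0108:    /* IRDMA_AE_AMP_BOUNDS_VIOLATION */
                return (struct qp_err){ FLUSH_PROT_ERR, QP_EVENT_ACCESS_ERR };
        case 0x050a:    /* IRDMA_AE_LLP_TOO_MANY_RETRIES */
                return (struct qp_err){ FLUSH_RETRY_EXC_ERR, QP_EVENT_CATASTROPHIC };
        default:
                return (struct qp_err){ FLUSH_GENERAL_ERR, QP_EVENT_CATASTROPHIC };
        }
}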
IRDMA_AE_LLP_TOO_MANY_RETRIES: - qp->flush_code = FLUSH_RETRY_EXC_ERR; - qp->event_type = IRDMA_QP_EVENT_CATASTROPHIC; - break; - case IRDMA_AE_AMP_MWBIND_INVALID_RIGHTS: - case IRDMA_AE_AMP_MWBIND_BIND_DISABLED: - case IRDMA_AE_AMP_MWBIND_INVALID_BOUNDS: - case IRDMA_AE_AMP_MWBIND_VALID_STAG: - qp->flush_code = FLUSH_MW_BIND_ERR; - qp->event_type = IRDMA_QP_EVENT_ACCESS_ERR; - break; - case IRDMA_AE_IB_INVALID_REQUEST: - qp->flush_code = FLUSH_REM_INV_REQ_ERR; - qp->event_type = IRDMA_QP_EVENT_REQ_ERR; - break; - default: - qp->flush_code = FLUSH_GENERAL_ERR; - qp->event_type = IRDMA_QP_EVENT_CATASTROPHIC; - break; + if (qp->qp_uk.uk_attrs->hw_rev >= IRDMA_GEN_3) { + if (info->sq) { + qp->err_sq_idx_valid = true; + qp->err_sq_idx = info->wqe_idx; + } + if (info->rq) { + qp->err_rq_idx_valid = true; + qp->err_rq_idx = info->wqe_idx; + } + } + + qp_err = irdma_ae_to_qp_err_code(info->ae_id); + qp->flush_code = qp_err.flush_code; + qp->event_type = qp_err.event_type; +} + +/** + * irdma_complete_cqp_request - perform post-completion cleanup + * @cqp: device CQP + * @cqp_request: CQP request + * + * Mark CQP request as done, wake up waiting thread or invoke + * callback function and release/free CQP request. + */ +static void irdma_complete_cqp_request(struct irdma_cqp *cqp, + struct irdma_cqp_request *cqp_request) +{ + if (cqp_request->waiting) { + WRITE_ONCE(cqp_request->request_done, true); + wake_up(&cqp_request->waitq); + } else if (cqp_request->callback_fcn) { + cqp_request->callback_fcn(cqp_request); + } + irdma_put_cqp_request(cqp, cqp_request); +} + +/** + * irdma_process_ae_def_cmpl - handle IRDMA_AE_CQP_DEFERRED_COMPLETE event + * @rf: RDMA PCI function + * @info: AEQ entry info + */ +static void irdma_process_ae_def_cmpl(struct irdma_pci_f *rf, + struct irdma_aeqe_info *info) +{ + u32 sw_def_info; + u64 scratch; + + irdma_cqp_ce_handler(rf, &rf->ccq.sc_cq); + + irdma_sc_cqp_def_cmpl_ae_handler(&rf->sc_dev, info, true, + &scratch, &sw_def_info); + while (scratch) { + struct irdma_cqp_request *cqp_request = + (struct irdma_cqp_request *)(uintptr_t)scratch; + + irdma_complete_cqp_request(&rf->cqp, cqp_request); + irdma_sc_cqp_def_cmpl_ae_handler(&rf->sc_dev, info, false, + &scratch, &sw_def_info); } } @@ -223,6 +217,7 @@ static void irdma_process_aeq(struct irdma_pci_f *rf) struct irdma_sc_qp *qp = NULL; struct irdma_qp_host_ctx_info *ctx_info = NULL; struct irdma_device *iwdev = rf->iwdev; + struct irdma_sc_srq *srq; unsigned long flags; u32 aeqcnt = 0; @@ -236,6 +231,13 @@ static void irdma_process_aeq(struct irdma_pci_f *rf) if (ret) break; + if (info->aeqe_overflow) { + ibdev_err(&iwdev->ibdev, "AEQ has overflowed\n"); + rf->reset = true; + rf->gen_ops.request_reset(rf); + return; + } + aeqcnt++; ibdev_dbg(&iwdev->ibdev, "AEQ: ae_id = 0x%x bool qp=%d qp_id = %d tcp_state=%d iwarp_state=%d ae_src=%d\n", @@ -266,9 +268,12 @@ static void irdma_process_aeq(struct irdma_pci_f *rf) if (info->ae_id != IRDMA_AE_QP_SUSPEND_COMPLETE) iwqp->last_aeq = info->ae_id; spin_unlock_irqrestore(&iwqp->lock, flags); - ctx_info = &iwqp->ctx_info; + } else if (info->srq) { + if (info->ae_id != IRDMA_AE_SRQ_LIMIT) + continue; } else { - if (info->ae_id != IRDMA_AE_CQ_OPERATION_ERROR) + if (info->ae_id != IRDMA_AE_CQ_OPERATION_ERROR && + info->ae_id != IRDMA_AE_CQP_DEFERRED_COMPLETE) continue; } @@ -363,6 +368,18 @@ static void irdma_process_aeq(struct irdma_pci_f *rf) } irdma_cq_rem_ref(&iwcq->ibcq); break; + case IRDMA_AE_SRQ_LIMIT: + srq = (struct irdma_sc_srq *)(uintptr_t)info->compl_ctx; + 
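irdma_process_ae_def_cmpl above drains deferred completions with a primed iterator: the first call to irdma_sc_cqp_def_cmpl_ae_handler starts the scan, and each subsequent call yields the next pending scratch cookie until it returns 0. The pattern in isolation, with hypothetical stand-ins for the two driver calls:

#include <stdint.h>

/* stand-in for irdma_sc_cqp_def_cmpl_ae_handler: yields the scratch
 * cookie of the next pending deferred op, or 0 once drained
 */
uint64_t next_deferred(int first, uint32_t *sw_def_info);

void complete_request(uint64_t scratch); /* wake waiter / run callback */

/* the drain shape of irdma_process_ae_def_cmpl: prime, then loop */
static void drain_deferred(void)
{
        uint32_t sw_def_info;
        uint64_t scratch = next_deferred(1, &sw_def_info);

        while (scratch) {
                complete_request(scratch);
                scratch = next_deferred(0, &sw_def_info);
        }
}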
irdma_srq_event(srq); + break; + case IRDMA_AE_SRQ_CATASTROPHIC_ERROR: + break; + case IRDMA_AE_CQP_DEFERRED_COMPLETE: + /* Remove completed CQP requests from pending list + * and notify about those CQP ops completion. + */ + irdma_process_ae_def_cmpl(rf, info); + break; case IRDMA_AE_RESET_NOT_SENT: case IRDMA_AE_LLP_DOUBT_REACHABILITY: case IRDMA_AE_RESOURCE_EXHAUSTION: @@ -389,13 +406,18 @@ static void irdma_process_aeq(struct irdma_pci_f *rf) case IRDMA_AE_LCE_FUNCTION_CATASTROPHIC: case IRDMA_AE_LLP_TOO_MANY_RNRS: case IRDMA_AE_LCE_CQ_CATASTROPHIC: + case IRDMA_AE_REMOTE_QP_CATASTROPHIC: + case IRDMA_AE_LOCAL_QP_CATASTROPHIC: + case IRDMA_AE_RCE_QP_CATASTROPHIC: case IRDMA_AE_UDA_XMIT_DGRAM_TOO_LONG: default: ibdev_err(&iwdev->ibdev, "abnormal ae_id = 0x%x bool qp=%d qp_id = %d, ae_src=%d\n", info->ae_id, info->qp, info->qp_cq_id, info->ae_src); - if (rdma_protocol_roce(&iwdev->ibdev, 1)) { - ctx_info->roce_info->err_rq_idx_valid = info->rq; - if (info->rq) { + ctx_info = &iwqp->ctx_info; + if (rdma_protocol_roce(&iwqp->iwdev->ibdev, 1)) { + ctx_info->roce_info->err_rq_idx_valid = + ctx_info->srq_valid ? false : info->err_rq_idx_valid; + if (ctx_info->roce_info->err_rq_idx_valid) { ctx_info->roce_info->err_rq_idx = info->wqe_idx; irdma_sc_qp_setctx_roce(&iwqp->sc_qp, iwqp->host_ctx.va, ctx_info); @@ -599,6 +621,8 @@ static void irdma_destroy_cqp(struct irdma_pci_f *rf) dma_free_coherent(dev->hw->device, cqp->sq.size, cqp->sq.va, cqp->sq.pa); cqp->sq.va = NULL; + kfree(cqp->oop_op_array); + cqp->oop_op_array = NULL; kfree(cqp->scratch_array); cqp->scratch_array = NULL; kfree(cqp->cqp_requests); @@ -631,7 +655,9 @@ static void irdma_destroy_aeq(struct irdma_pci_f *rf) int status = -EBUSY; if (!rf->msix_shared) { - rf->sc_dev.irq_ops->irdma_cfg_aeq(&rf->sc_dev, rf->iw_msixtbl->idx, false); + if (rf->sc_dev.privileged) + rf->sc_dev.irq_ops->irdma_cfg_aeq(&rf->sc_dev, + rf->iw_msixtbl->idx, false); irdma_destroy_irq(rf, rf->iw_msixtbl, rf); } if (rf->reset) @@ -697,9 +723,10 @@ static void irdma_del_ceq_0(struct irdma_pci_f *rf) if (rf->msix_shared) { msix_vec = &rf->iw_msixtbl[0]; - rf->sc_dev.irq_ops->irdma_cfg_ceq(&rf->sc_dev, - msix_vec->ceq_id, - msix_vec->idx, false); + if (rf->sc_dev.privileged) + rf->sc_dev.irq_ops->irdma_cfg_ceq(&rf->sc_dev, + msix_vec->ceq_id, + msix_vec->idx, false); irdma_destroy_irq(rf, msix_vec, rf); } else { msix_vec = &rf->iw_msixtbl[1]; @@ -730,8 +757,10 @@ static void irdma_del_ceqs(struct irdma_pci_f *rf) msix_vec = &rf->iw_msixtbl[2]; for (i = 1; i < rf->ceqs_count; i++, msix_vec++, iwceq++) { - rf->sc_dev.irq_ops->irdma_cfg_ceq(&rf->sc_dev, msix_vec->ceq_id, - msix_vec->idx, false); + if (rf->sc_dev.privileged) + rf->sc_dev.irq_ops->irdma_cfg_ceq(&rf->sc_dev, + msix_vec->ceq_id, + msix_vec->idx, false); irdma_destroy_irq(rf, msix_vec, iwceq); irdma_cqp_ceq_cmd(&rf->sc_dev, &iwceq->sc_ceq, IRDMA_OP_CEQ_DESTROY); @@ -942,6 +971,13 @@ static int irdma_create_cqp(struct irdma_pci_f *rf) goto err_scratch; } + cqp->oop_op_array = kcalloc(sqsize, sizeof(*cqp->oop_op_array), + GFP_KERNEL); + if (!cqp->oop_op_array) { + status = -ENOMEM; + goto err_oop; + } + cqp_init_info.ooo_op_array = cqp->oop_op_array; dev->cqp = &cqp->sc_cqp; dev->cqp->dev = dev; cqp->sq.size = ALIGN(sizeof(struct irdma_cqp_sq_wqe) * sqsize, @@ -978,6 +1014,10 @@ static int irdma_create_cqp(struct irdma_pci_f *rf) case IRDMA_GEN_2: cqp_init_info.hw_maj_ver = IRDMA_CQPHC_HW_MAJVER_GEN_2; break; + case IRDMA_GEN_3: + cqp_init_info.hw_maj_ver = IRDMA_CQPHC_HW_MAJVER_GEN_3; + 
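/*
 * A minimal sketch (assumed names, not irdma symbols) of the
 * per-generation CQP setup pattern used here: one tracking array sized
 * to the CQP SQ for out-of-order ops, plus a switch that selects the
 * major version reported to the device, with GEN_3 additionally setting
 * ts_override.
 */
#include <stdio.h>
#include <stdlib.h>

enum gen { GEN_1 = 1, GEN_2, GEN_3 };

struct cqp_cfg {
	unsigned int hw_maj_ver;
	int ts_override;
	void **ooo_op_array;	/* one slot per SQ entry */
};

static int cfg_cqp(struct cqp_cfg *cfg, enum gen gen, size_t sqsize)
{
	cfg->ooo_op_array = calloc(sqsize, sizeof(*cfg->ooo_op_array));
	if (!cfg->ooo_op_array)
		return -1;

	switch (gen) {
	case GEN_1:
		cfg->hw_maj_ver = 1;
		break;
	case GEN_2:
		cfg->hw_maj_ver = 2;
		break;
	case GEN_3:
		cfg->hw_maj_ver = 3;
		cfg->ts_override = 1;
		break;
	}
	return 0;
}

int main(void)
{
	struct cqp_cfg cfg = { 0 };

	if (cfg_cqp(&cfg, GEN_3, 2048))
		return 1;
	printf("maj_ver=%u ts_override=%d\n", cfg.hw_maj_ver, cfg.ts_override);
	free(cfg.ooo_op_array);
	return 0;
}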
cqp_init_info.ts_override = 1; + break; } status = irdma_sc_cqp_init(dev->cqp, &cqp_init_info); if (status) { @@ -1012,6 +1052,9 @@ err_ctx: cqp->sq.va, cqp->sq.pa); cqp->sq.va = NULL; err_sq: + kfree(cqp->oop_op_array); + cqp->oop_op_array = NULL; +err_oop: kfree(cqp->scratch_array); cqp->scratch_array = NULL; err_scratch: @@ -1033,13 +1076,15 @@ static int irdma_create_ccq(struct irdma_pci_f *rf) struct irdma_sc_dev *dev = &rf->sc_dev; struct irdma_ccq_init_info info = {}; struct irdma_ccq *ccq = &rf->ccq; + int ccq_size; int status; dev->ccq = &ccq->sc_cq; dev->ccq->dev = dev; info.dev = dev; + ccq_size = (rf->rdma_ver >= IRDMA_GEN_3) ? IW_GEN_3_CCQ_SIZE : IW_CCQ_SIZE; ccq->shadow_area.size = sizeof(struct irdma_cq_shadow_area); - ccq->mem_cq.size = ALIGN(sizeof(struct irdma_cqe) * IW_CCQ_SIZE, + ccq->mem_cq.size = ALIGN(sizeof(struct irdma_cqe) * ccq_size, IRDMA_CQ0_ALIGNMENT); ccq->mem_cq.va = dma_alloc_coherent(dev->hw->device, ccq->mem_cq.size, &ccq->mem_cq.pa, GFP_KERNEL); @@ -1056,7 +1101,7 @@ static int irdma_create_ccq(struct irdma_pci_f *rf) /* populate the ccq init info */ info.cq_base = ccq->mem_cq.va; info.cq_pa = ccq->mem_cq.pa; - info.num_elem = IW_CCQ_SIZE; + info.num_elem = ccq_size; info.shadow_area = ccq->shadow_area.va; info.shadow_area_pa = ccq->shadow_area.pa; info.ceqe_mask = false; @@ -1140,9 +1185,13 @@ static int irdma_cfg_ceq_vector(struct irdma_pci_f *rf, struct irdma_ceq *iwceq, } msix_vec->ceq_id = ceq_id; - rf->sc_dev.irq_ops->irdma_cfg_ceq(&rf->sc_dev, ceq_id, msix_vec->idx, true); - - return 0; + if (rf->sc_dev.privileged) + rf->sc_dev.irq_ops->irdma_cfg_ceq(&rf->sc_dev, ceq_id, + msix_vec->idx, true); + else + status = irdma_vchnl_req_ceq_vec_map(&rf->sc_dev, ceq_id, + msix_vec->idx); + return status; } /** @@ -1155,7 +1204,7 @@ static int irdma_cfg_ceq_vector(struct irdma_pci_f *rf, struct irdma_ceq *iwceq, static int irdma_cfg_aeq_vector(struct irdma_pci_f *rf) { struct irdma_msix_vector *msix_vec = rf->iw_msixtbl; - u32 ret = 0; + int ret = 0; if (!rf->msix_shared) { snprintf(msix_vec->name, sizeof(msix_vec->name) - 1, @@ -1166,12 +1215,16 @@ static int irdma_cfg_aeq_vector(struct irdma_pci_f *rf) } if (ret) { ibdev_dbg(&rf->iwdev->ibdev, "ERR: aeq irq config fail\n"); - return -EINVAL; + return ret; } - rf->sc_dev.irq_ops->irdma_cfg_aeq(&rf->sc_dev, msix_vec->idx, true); + if (rf->sc_dev.privileged) + rf->sc_dev.irq_ops->irdma_cfg_aeq(&rf->sc_dev, msix_vec->idx, + true); + else + ret = irdma_vchnl_req_aeq_vec_map(&rf->sc_dev, msix_vec->idx); - return 0; + return ret; } /** @@ -1179,13 +1232,13 @@ static int irdma_cfg_aeq_vector(struct irdma_pci_f *rf) * @rf: RDMA PCI function * @iwceq: pointer to the ceq resources to be created * @ceq_id: the id number of the iwceq - * @vsi: SC vsi struct + * @vsi_idx: vsi idx * * Return 0, if the ceq and the resources associated with it * are successfully created, otherwise return error */ static int irdma_create_ceq(struct irdma_pci_f *rf, struct irdma_ceq *iwceq, - u32 ceq_id, struct irdma_sc_vsi *vsi) + u32 ceq_id, u16 vsi_idx) { int status; struct irdma_ceq_init_info info = {}; @@ -1209,7 +1262,7 @@ static int irdma_create_ceq(struct irdma_pci_f *rf, struct irdma_ceq *iwceq, info.elem_cnt = ceq_size; iwceq->sc_ceq.ceq_id = ceq_id; info.dev = dev; - info.vsi = vsi; + info.vsi_idx = vsi_idx; status = irdma_sc_ceq_init(&iwceq->sc_ceq, &info); if (!status) { if (dev->ceq_valid) @@ -1252,7 +1305,7 @@ static int irdma_setup_ceq_0(struct irdma_pci_f *rf) } iwceq = &rf->ceqlist[0]; - status = irdma_create_ceq(rf, iwceq, 
0, &rf->default_vsi); + status = irdma_create_ceq(rf, iwceq, 0, rf->default_vsi.vsi_idx); if (status) { ibdev_dbg(&rf->iwdev->ibdev, "ERR: create ceq status = %d\n", status); @@ -1287,13 +1340,13 @@ exit: /** * irdma_setup_ceqs - manage the device ceq's and their interrupt resources * @rf: RDMA PCI function - * @vsi: VSI structure for this CEQ + * @vsi_idx: vsi_idx for this CEQ * * Allocate a list for all device completion event queues * Create the ceq's and configure their msix interrupt vectors * Return 0, if ceqs are successfully set up, otherwise return error */ -static int irdma_setup_ceqs(struct irdma_pci_f *rf, struct irdma_sc_vsi *vsi) +static int irdma_setup_ceqs(struct irdma_pci_f *rf, u16 vsi_idx) { u32 i; u32 ceq_id; @@ -1306,7 +1359,7 @@ static int irdma_setup_ceqs(struct irdma_pci_f *rf, struct irdma_sc_vsi *vsi) i = (rf->msix_shared) ? 1 : 2; for (ceq_id = 1; i < num_ceqs; i++, ceq_id++) { iwceq = &rf->ceqlist[ceq_id]; - status = irdma_create_ceq(rf, iwceq, ceq_id, vsi); + status = irdma_create_ceq(rf, iwceq, ceq_id, vsi_idx); if (status) { ibdev_dbg(&rf->iwdev->ibdev, "ERR: create ceq status = %d\n", status); @@ -1387,7 +1440,10 @@ static int irdma_create_aeq(struct irdma_pci_f *rf) aeq_size = multiplier * hmc_info->hmc_obj[IRDMA_HMC_IW_QP].cnt + hmc_info->hmc_obj[IRDMA_HMC_IW_CQ].cnt; aeq_size = min(aeq_size, dev->hw_attrs.max_hw_aeq_size); - + /* GEN_3 does not support virtual AEQ. Cap at max Kernel alloc size */ + if (rf->rdma_ver == IRDMA_GEN_3) + aeq_size = min(aeq_size, (u32)((PAGE_SIZE << MAX_PAGE_ORDER) / + sizeof(struct irdma_sc_aeqe))); aeq->mem.size = ALIGN(sizeof(struct irdma_sc_aeqe) * aeq_size, IRDMA_AEQ_ALIGNMENT); aeq->mem.va = dma_alloc_coherent(dev->hw->device, aeq->mem.size, @@ -1395,6 +1451,8 @@ static int irdma_create_aeq(struct irdma_pci_f *rf) GFP_KERNEL | __GFP_NOWARN); if (aeq->mem.va) goto skip_virt_aeq; + else if (rf->rdma_ver == IRDMA_GEN_3) + return -ENOMEM; /* physically mapped aeq failed. 
setup virtual aeq */ status = irdma_create_virt_aeq(rf, aeq_size); @@ -1569,6 +1627,8 @@ static void irdma_del_init_mem(struct irdma_pci_f *rf) { struct irdma_sc_dev *dev = &rf->sc_dev; + if (!rf->sc_dev.privileged) + irdma_vchnl_req_put_hmc_fcn(&rf->sc_dev); kfree(dev->hmc_info->sd_table.sd_entry); dev->hmc_info->sd_table.sd_entry = NULL; vfree(rf->mem_rsrc); @@ -1635,6 +1695,7 @@ static int irdma_initialize_dev(struct irdma_pci_f *rf) info.bar0 = rf->hw.hw_addr; info.hmc_fn_id = rf->pf_id; + info.protocol_used = rf->protocol_used; info.hw = &rf->hw; status = irdma_sc_dev_init(rf->rdma_ver, &rf->sc_dev, &info); if (status) @@ -1665,9 +1726,6 @@ void irdma_rt_deinit_hw(struct irdma_device *iwdev) irdma_del_local_mac_entry(iwdev->rf, (u8)iwdev->mac_ip_table_idx); fallthrough; - case AEQ_CREATED: - case PBLE_CHUNK_MEM: - case CEQS_CREATED: case IEQ_CREATED: if (!iwdev->roce_mode) irdma_puda_dele_rsrc(&iwdev->vsi, IRDMA_PUDA_RSRC_TYPE_IEQ, @@ -1740,7 +1798,9 @@ static void irdma_get_used_rsrc(struct irdma_device *iwdev) iwdev->rf->used_qps = find_first_zero_bit(iwdev->rf->allocated_qps, iwdev->rf->max_qp); iwdev->rf->used_cqs = find_first_zero_bit(iwdev->rf->allocated_cqs, - iwdev->rf->max_cq); + iwdev->rf->max_cq); + iwdev->rf->used_srqs = find_first_zero_bit(iwdev->rf->allocated_srqs, + iwdev->rf->max_srq); iwdev->rf->used_mrs = find_first_zero_bit(iwdev->rf->allocated_mrs, iwdev->rf->max_mr); } @@ -1750,13 +1810,17 @@ void irdma_ctrl_deinit_hw(struct irdma_pci_f *rf) enum init_completion_state state = rf->init_state; rf->init_state = INVALID_STATE; - if (rf->rsrc_created) { + + switch (state) { + case AEQ_CREATED: irdma_destroy_aeq(rf); + fallthrough; + case PBLE_CHUNK_MEM: irdma_destroy_pble_prm(rf->pble_rsrc); + fallthrough; + case CEQS_CREATED: irdma_del_ceqs(rf); - rf->rsrc_created = false; - } - switch (state) { + fallthrough; case CEQ0_CREATED: irdma_del_ceq_0(rf); fallthrough; @@ -1835,32 +1899,6 @@ int irdma_rt_init_hw(struct irdma_device *iwdev, break; iwdev->init_state = IEQ_CREATED; } - if (!rf->rsrc_created) { - status = irdma_setup_ceqs(rf, &iwdev->vsi); - if (status) - break; - - iwdev->init_state = CEQS_CREATED; - - status = irdma_hmc_init_pble(&rf->sc_dev, - rf->pble_rsrc); - if (status) { - irdma_del_ceqs(rf); - break; - } - - iwdev->init_state = PBLE_CHUNK_MEM; - - status = irdma_setup_aeq(rf); - if (status) { - irdma_destroy_pble_prm(rf->pble_rsrc); - irdma_del_ceqs(rf); - break; - } - iwdev->init_state = AEQ_CREATED; - rf->rsrc_created = true; - } - if (iwdev->rf->sc_dev.hw_attrs.uk_attrs.hw_rev == IRDMA_GEN_1) irdma_alloc_set_mac(iwdev); irdma_add_ip(iwdev); @@ -1907,6 +1945,13 @@ int irdma_ctrl_init_hw(struct irdma_pci_f *rf) break; rf->init_state = CQP_CREATED; + dev->feature_info[IRDMA_FEATURE_FW_INFO] = IRDMA_FW_VER_DEFAULT; + if (rf->rdma_ver != IRDMA_GEN_1) { + status = irdma_get_rdma_features(dev); + if (status) + break; + } + status = irdma_hmc_setup(rf); if (status) break; @@ -1922,13 +1967,6 @@ int irdma_ctrl_init_hw(struct irdma_pci_f *rf) break; rf->init_state = CCQ_CREATED; - dev->feature_info[IRDMA_FEATURE_FW_INFO] = IRDMA_FW_VER_DEFAULT; - if (rf->rdma_ver != IRDMA_GEN_1) { - status = irdma_get_rdma_features(dev); - if (status) - break; - } - status = irdma_setup_ceq_0(rf); if (status) break; @@ -1942,6 +1980,25 @@ int irdma_ctrl_init_hw(struct irdma_pci_f *rf) } INIT_WORK(&rf->cqp_cmpl_work, cqp_compl_worker); irdma_sc_ccq_arm(dev->ccq); + + status = irdma_setup_ceqs(rf, rf->iwdev ? 
rf->iwdev->vsi_num : 0); + if (status) + break; + + rf->init_state = CEQS_CREATED; + + status = irdma_hmc_init_pble(&rf->sc_dev, + rf->pble_rsrc); + if (status) + break; + + rf->init_state = PBLE_CHUNK_MEM; + + status = irdma_setup_aeq(rf); + if (status) + break; + rf->init_state = AEQ_CREATED; + return 0; } while (0); @@ -1960,7 +2017,8 @@ static void irdma_set_hw_rsrc(struct irdma_pci_f *rf) rf->allocated_qps = (void *)(rf->mem_rsrc + (sizeof(struct irdma_arp_entry) * rf->arp_table_size)); rf->allocated_cqs = &rf->allocated_qps[BITS_TO_LONGS(rf->max_qp)]; - rf->allocated_mrs = &rf->allocated_cqs[BITS_TO_LONGS(rf->max_cq)]; + rf->allocated_srqs = &rf->allocated_cqs[BITS_TO_LONGS(rf->max_cq)]; + rf->allocated_mrs = &rf->allocated_srqs[BITS_TO_LONGS(rf->max_srq)]; rf->allocated_pds = &rf->allocated_mrs[BITS_TO_LONGS(rf->max_mr)]; rf->allocated_ahs = &rf->allocated_pds[BITS_TO_LONGS(rf->max_pd)]; rf->allocated_mcgs = &rf->allocated_ahs[BITS_TO_LONGS(rf->max_ah)]; @@ -1988,12 +2046,14 @@ static u32 irdma_calc_mem_rsrc_size(struct irdma_pci_f *rf) rsrc_size += sizeof(unsigned long) * BITS_TO_LONGS(rf->max_qp); rsrc_size += sizeof(unsigned long) * BITS_TO_LONGS(rf->max_mr); rsrc_size += sizeof(unsigned long) * BITS_TO_LONGS(rf->max_cq); + rsrc_size += sizeof(unsigned long) * BITS_TO_LONGS(rf->max_srq); rsrc_size += sizeof(unsigned long) * BITS_TO_LONGS(rf->max_pd); rsrc_size += sizeof(unsigned long) * BITS_TO_LONGS(rf->arp_table_size); rsrc_size += sizeof(unsigned long) * BITS_TO_LONGS(rf->max_ah); rsrc_size += sizeof(unsigned long) * BITS_TO_LONGS(rf->max_mcg); rsrc_size += sizeof(struct irdma_qp **) * rf->max_qp; rsrc_size += sizeof(struct irdma_cq **) * rf->max_cq; + rsrc_size += sizeof(struct irdma_srq **) * rf->max_srq; return rsrc_size; } @@ -2021,6 +2081,7 @@ u32 irdma_initialize_hw_rsrc(struct irdma_pci_f *rf) rf->max_qp = rf->sc_dev.hmc_info->hmc_obj[IRDMA_HMC_IW_QP].cnt; rf->max_mr = rf->sc_dev.hmc_info->hmc_obj[IRDMA_HMC_IW_MR].cnt; rf->max_cq = rf->sc_dev.hmc_info->hmc_obj[IRDMA_HMC_IW_CQ].cnt; + rf->max_srq = rf->sc_dev.hmc_info->hmc_obj[IRDMA_HMC_IW_SRQ].cnt; rf->max_pd = rf->sc_dev.hw_attrs.max_hw_pds; rf->arp_table_size = rf->sc_dev.hmc_info->hmc_obj[IRDMA_HMC_IW_ARP].cnt; rf->max_ah = rf->sc_dev.hmc_info->hmc_obj[IRDMA_HMC_IW_FSIAV].cnt; @@ -2040,6 +2101,7 @@ u32 irdma_initialize_hw_rsrc(struct irdma_pci_f *rf) set_bit(0, rf->allocated_mrs); set_bit(0, rf->allocated_qps); set_bit(0, rf->allocated_cqs); + set_bit(0, rf->allocated_srqs); set_bit(0, rf->allocated_pds); set_bit(0, rf->allocated_arps); set_bit(0, rf->allocated_ahs); @@ -2100,15 +2162,16 @@ void irdma_cqp_ce_handler(struct irdma_pci_f *rf, struct irdma_sc_cq *cq) cqp_request->compl_info.op_ret_val = info.op_ret_val; cqp_request->compl_info.error = info.error; - if (cqp_request->waiting) { - WRITE_ONCE(cqp_request->request_done, true); - wake_up(&cqp_request->waitq); - irdma_put_cqp_request(&rf->cqp, cqp_request); - } else { - if (cqp_request->callback_fcn) - cqp_request->callback_fcn(cqp_request); - irdma_put_cqp_request(&rf->cqp, cqp_request); - } + /* + * If this is deferred or pending completion, then mark + * CQP request as pending to not block the CQ, but don't + * release CQP request, as it is still on the OOO list. 
+ */ + if (info.pending) + cqp_request->pending = true; + else + irdma_complete_cqp_request(&rf->cqp, + cqp_request); } cqe_count++; @@ -2718,7 +2781,9 @@ void irdma_flush_wqes(struct irdma_qp *iwqp, u32 flush_mask) struct irdma_pci_f *rf = iwqp->iwdev->rf; u8 flush_code = iwqp->sc_qp.flush_code; - if (!(flush_mask & IRDMA_FLUSH_SQ) && !(flush_mask & IRDMA_FLUSH_RQ)) + if ((!(flush_mask & IRDMA_FLUSH_SQ) && + !(flush_mask & IRDMA_FLUSH_RQ)) || + ((flush_mask & IRDMA_REFLUSH) && rf->rdma_ver >= IRDMA_GEN_3)) return; /* Set flush info fields*/ @@ -2731,6 +2796,10 @@ void irdma_flush_wqes(struct irdma_qp *iwqp, u32 flush_mask) info.rq_major_code = IRDMA_FLUSH_MAJOR_ERR; info.rq_minor_code = FLUSH_GENERAL_ERR; info.userflushcode = true; + info.err_sq_idx_valid = iwqp->sc_qp.err_sq_idx_valid; + info.err_sq_idx = iwqp->sc_qp.err_sq_idx; + info.err_rq_idx_valid = iwqp->sc_qp.err_rq_idx_valid; + info.err_rq_idx = iwqp->sc_qp.err_rq_idx; if (flush_mask & IRDMA_REFLUSH) { if (info.sq) diff --git a/drivers/infiniband/hw/irdma/i40iw_hw.c b/drivers/infiniband/hw/irdma/i40iw_hw.c index ce61a27cb1f6..60c1f2b1811d 100644 --- a/drivers/infiniband/hw/irdma/i40iw_hw.c +++ b/drivers/infiniband/hw/irdma/i40iw_hw.c @@ -85,6 +85,7 @@ static u64 i40iw_masks[IRDMA_MAX_MASKS] = { I40E_CQPSQ_CQ_CEQID, I40E_CQPSQ_CQ_CQID, I40E_COMMIT_FPM_CQCNT, + I40E_CQPSQ_UPESD_HMCFNID, }; static u64 i40iw_shifts[IRDMA_MAX_SHIFTS] = { @@ -94,6 +95,7 @@ static u64 i40iw_shifts[IRDMA_MAX_SHIFTS] = { I40E_CQPSQ_CQ_CEQID_S, I40E_CQPSQ_CQ_CQID_S, I40E_COMMIT_FPM_CQCNT_S, + I40E_CQPSQ_UPESD_HMCFNID_S, }; /** diff --git a/drivers/infiniband/hw/irdma/i40iw_hw.h b/drivers/infiniband/hw/irdma/i40iw_hw.h index e1db84d8a62c..0095b327afcc 100644 --- a/drivers/infiniband/hw/irdma/i40iw_hw.h +++ b/drivers/infiniband/hw/irdma/i40iw_hw.h @@ -123,6 +123,8 @@ #define I40E_CQPSQ_CQ_CQID GENMASK_ULL(15, 0) #define I40E_COMMIT_FPM_CQCNT_S 0 #define I40E_COMMIT_FPM_CQCNT GENMASK_ULL(17, 0) +#define I40E_CQPSQ_UPESD_HMCFNID_S 0 +#define I40E_CQPSQ_UPESD_HMCFNID GENMASK_ULL(5, 0) #define I40E_VSIQF_CTL(_VSI) (0x0020D800 + ((_VSI) * 4)) diff --git a/drivers/infiniband/hw/irdma/i40iw_if.c b/drivers/infiniband/hw/irdma/i40iw_if.c index cc50a7070371..15e036ddaffb 100644 --- a/drivers/infiniband/hw/irdma/i40iw_if.c +++ b/drivers/infiniband/hw/irdma/i40iw_if.c @@ -75,6 +75,9 @@ static void i40iw_fill_device_info(struct irdma_device *iwdev, struct i40e_info struct irdma_pci_f *rf = iwdev->rf; rf->rdma_ver = IRDMA_GEN_1; + rf->sc_dev.hw = &rf->hw; + rf->sc_dev.hw_attrs.uk_attrs.hw_rev = IRDMA_GEN_1; + rf->sc_dev.privileged = true; rf->gen_ops.request_reset = i40iw_request_reset; rf->pcidev = cdev_info->pcidev; rf->pf_id = cdev_info->fid; diff --git a/drivers/infiniband/hw/irdma/icrdma_hw.c b/drivers/infiniband/hw/irdma/icrdma_hw.c index 941d3edffadb..32f26284a788 100644 --- a/drivers/infiniband/hw/irdma/icrdma_hw.c +++ b/drivers/infiniband/hw/irdma/icrdma_hw.c @@ -38,6 +38,7 @@ static u64 icrdma_masks[IRDMA_MAX_MASKS] = { ICRDMA_CQPSQ_CQ_CEQID, ICRDMA_CQPSQ_CQ_CQID, ICRDMA_COMMIT_FPM_CQCNT, + ICRDMA_CQPSQ_UPESD_HMCFNID, }; static u64 icrdma_shifts[IRDMA_MAX_SHIFTS] = { @@ -47,6 +48,7 @@ static u64 icrdma_shifts[IRDMA_MAX_SHIFTS] = { ICRDMA_CQPSQ_CQ_CEQID_S, ICRDMA_CQPSQ_CQ_CQID_S, ICRDMA_COMMIT_FPM_CQCNT_S, + ICRDMA_CQPSQ_UPESD_HMCFNID_S, }; /** @@ -194,6 +196,7 @@ void icrdma_init_hw(struct irdma_sc_dev *dev) dev->hw_attrs.max_hw_ord = ICRDMA_MAX_ORD_SIZE; dev->hw_attrs.max_stat_inst = ICRDMA_MAX_STATS_COUNT; dev->hw_attrs.max_stat_idx = 
IRDMA_HW_STAT_INDEX_MAX_GEN_2; + dev->hw_attrs.max_hw_device_pages = ICRDMA_MAX_PUSH_PAGE_COUNT; dev->hw_attrs.uk_attrs.min_hw_wq_size = ICRDMA_MIN_WQ_SIZE; dev->hw_attrs.uk_attrs.max_hw_sq_chunk = IRDMA_MAX_QUANTA_PER_WR; diff --git a/drivers/infiniband/hw/irdma/icrdma_hw.h b/drivers/infiniband/hw/irdma/icrdma_hw.h index 697b9572b5c6..d97944ab45da 100644 --- a/drivers/infiniband/hw/irdma/icrdma_hw.h +++ b/drivers/infiniband/hw/irdma/icrdma_hw.h @@ -58,14 +58,15 @@ #define ICRDMA_CQPSQ_CQ_CQID GENMASK_ULL(18, 0) #define ICRDMA_COMMIT_FPM_CQCNT_S 0 #define ICRDMA_COMMIT_FPM_CQCNT GENMASK_ULL(19, 0) - +#define ICRDMA_CQPSQ_UPESD_HMCFNID_S 0 +#define ICRDMA_CQPSQ_UPESD_HMCFNID GENMASK_ULL(5, 0) enum icrdma_device_caps_const { ICRDMA_MAX_STATS_COUNT = 128, ICRDMA_MAX_IRD_SIZE = 127, ICRDMA_MAX_ORD_SIZE = 255, ICRDMA_MIN_WQ_SIZE = 8 /* WQEs */, - + ICRDMA_MAX_PUSH_PAGE_COUNT = 256, }; void icrdma_init_hw(struct irdma_sc_dev *dev); diff --git a/drivers/infiniband/hw/irdma/icrdma_if.c b/drivers/infiniband/hw/irdma/icrdma_if.c new file mode 100644 index 000000000000..27b191f61caf --- /dev/null +++ b/drivers/infiniband/hw/irdma/icrdma_if.c @@ -0,0 +1,343 @@ +// SPDX-License-Identifier: GPL-2.0 or Linux-OpenIB +/* Copyright (c) 2015 - 2024 Intel Corporation */ + +#include "main.h" +#include <linux/net/intel/iidc_rdma_ice.h> + +static void icrdma_prep_tc_change(struct irdma_device *iwdev) +{ + iwdev->vsi.tc_change_pending = true; + irdma_sc_suspend_resume_qps(&iwdev->vsi, IRDMA_OP_SUSPEND); + + /* Wait for all qp's to suspend */ + wait_event_timeout(iwdev->suspend_wq, + !atomic_read(&iwdev->vsi.qp_suspend_reqs), + msecs_to_jiffies(IRDMA_EVENT_TIMEOUT_MS)); + irdma_ws_reset(&iwdev->vsi); +} + +static void icrdma_fill_qos_info(struct irdma_l2params *l2params, + struct iidc_rdma_qos_params *qos_info) +{ + int i; + + l2params->num_tc = qos_info->num_tc; + l2params->vsi_prio_type = qos_info->vport_priority_type; + l2params->vsi_rel_bw = qos_info->vport_relative_bw; + for (i = 0; i < l2params->num_tc; i++) { + l2params->tc_info[i].egress_virt_up = + qos_info->tc_info[i].egress_virt_up; + l2params->tc_info[i].ingress_virt_up = + qos_info->tc_info[i].ingress_virt_up; + l2params->tc_info[i].prio_type = qos_info->tc_info[i].prio_type; + l2params->tc_info[i].rel_bw = qos_info->tc_info[i].rel_bw; + l2params->tc_info[i].tc_ctx = qos_info->tc_info[i].tc_ctx; + } + for (i = 0; i < IIDC_MAX_USER_PRIORITY; i++) + l2params->up2tc[i] = qos_info->up2tc[i]; + if (qos_info->pfc_mode == IIDC_DSCP_PFC_MODE) { + l2params->dscp_mode = true; + memcpy(l2params->dscp_map, qos_info->dscp_map, sizeof(l2params->dscp_map)); + } +} + +static void icrdma_iidc_event_handler(struct iidc_rdma_core_dev_info *cdev_info, + struct iidc_rdma_event *event) +{ + struct irdma_device *iwdev = dev_get_drvdata(&cdev_info->adev->dev); + struct irdma_l2params l2params = {}; + + if (*event->type & BIT(IIDC_RDMA_EVENT_AFTER_MTU_CHANGE)) { + ibdev_dbg(&iwdev->ibdev, "CLNT: new MTU = %d\n", iwdev->netdev->mtu); + if (iwdev->vsi.mtu != iwdev->netdev->mtu) { + l2params.mtu = iwdev->netdev->mtu; + l2params.mtu_changed = true; + irdma_log_invalid_mtu(l2params.mtu, &iwdev->rf->sc_dev); + irdma_change_l2params(&iwdev->vsi, &l2params); + } + } else if (*event->type & BIT(IIDC_RDMA_EVENT_BEFORE_TC_CHANGE)) { + if (iwdev->vsi.tc_change_pending) + return; + + icrdma_prep_tc_change(iwdev); + } else if (*event->type & BIT(IIDC_RDMA_EVENT_AFTER_TC_CHANGE)) { + struct iidc_rdma_priv_dev_info *idc_priv = cdev_info->iidc_priv; + + if (!iwdev->vsi.tc_change_pending) + 
return; + + l2params.tc_changed = true; + ibdev_dbg(&iwdev->ibdev, "CLNT: TC Change\n"); + + icrdma_fill_qos_info(&l2params, &idc_priv->qos_info); + if (iwdev->rf->protocol_used != IRDMA_IWARP_PROTOCOL_ONLY) + iwdev->dcb_vlan_mode = + l2params.num_tc > 1 && !l2params.dscp_mode; + irdma_change_l2params(&iwdev->vsi, &l2params); + } else if (*event->type & BIT(IIDC_RDMA_EVENT_CRIT_ERR)) { + ibdev_warn(&iwdev->ibdev, "ICE OICR event notification: oicr = 0x%08x\n", + event->reg); + if (event->reg & IRDMAPFINT_OICR_PE_CRITERR_M) { + u32 pe_criterr; + + pe_criterr = readl(iwdev->rf->sc_dev.hw_regs[IRDMA_GLPE_CRITERR]); +#define IRDMA_Q1_RESOURCE_ERR 0x0001024d + if (pe_criterr != IRDMA_Q1_RESOURCE_ERR) { + ibdev_err(&iwdev->ibdev, "critical PE Error, GLPE_CRITERR=0x%08x\n", + pe_criterr); + iwdev->rf->reset = true; + } else { + ibdev_warn(&iwdev->ibdev, "Q1 Resource Check\n"); + } + } + if (event->reg & IRDMAPFINT_OICR_HMC_ERR_M) { + ibdev_err(&iwdev->ibdev, "HMC Error\n"); + iwdev->rf->reset = true; + } + if (event->reg & IRDMAPFINT_OICR_PE_PUSH_M) { + ibdev_err(&iwdev->ibdev, "PE Push Error\n"); + iwdev->rf->reset = true; + } + if (iwdev->rf->reset) + iwdev->rf->gen_ops.request_reset(iwdev->rf); + } +} + +/** + * icrdma_lan_register_qset - Register qset with LAN driver + * @vsi: vsi structure + * @tc_node: Traffic class node + */ +static int icrdma_lan_register_qset(struct irdma_sc_vsi *vsi, + struct irdma_ws_node *tc_node) +{ + struct irdma_device *iwdev = vsi->back_vsi; + struct iidc_rdma_core_dev_info *cdev_info = iwdev->rf->cdev; + struct iidc_rdma_qset_params qset = {}; + int ret; + + qset.qs_handle = tc_node->qs_handle; + qset.tc = tc_node->traffic_class; + qset.vport_id = vsi->vsi_idx; + ret = ice_add_rdma_qset(cdev_info, &qset); + if (ret) { + ibdev_dbg(&iwdev->ibdev, "WS: LAN alloc_res for rdma qset failed.\n"); + return ret; + } + + tc_node->l2_sched_node_id = qset.teid; + vsi->qos[tc_node->user_pri].l2_sched_node_id = qset.teid; + + return 0; +} + +/** + * icrdma_lan_unregister_qset - Unregister qset with LAN driver + * @vsi: vsi structure + * @tc_node: Traffic class node + */ +static void icrdma_lan_unregister_qset(struct irdma_sc_vsi *vsi, + struct irdma_ws_node *tc_node) +{ + struct irdma_device *iwdev = vsi->back_vsi; + struct iidc_rdma_core_dev_info *cdev_info = iwdev->rf->cdev; + struct iidc_rdma_qset_params qset = {}; + + qset.qs_handle = tc_node->qs_handle; + qset.tc = tc_node->traffic_class; + qset.vport_id = vsi->vsi_idx; + qset.teid = tc_node->l2_sched_node_id; + + if (ice_del_rdma_qset(cdev_info, &qset)) + ibdev_dbg(&iwdev->ibdev, "WS: LAN free_res for rdma qset failed.\n"); +} + +/** + * icrdma_request_reset - Request a reset + * @rf: RDMA PCI function + */ +static void icrdma_request_reset(struct irdma_pci_f *rf) +{ + ibdev_warn(&rf->iwdev->ibdev, "Requesting a reset\n"); + ice_rdma_request_reset(rf->cdev, IIDC_FUNC_RESET); +} + +static int icrdma_init_interrupts(struct irdma_pci_f *rf, struct iidc_rdma_core_dev_info *cdev) +{ + int i; + + rf->msix_count = num_online_cpus() + IRDMA_NUM_AEQ_MSIX; + rf->msix_entries = kcalloc(rf->msix_count, sizeof(*rf->msix_entries), + GFP_KERNEL); + if (!rf->msix_entries) + return -ENOMEM; + + for (i = 0; i < rf->msix_count; i++) + if (ice_alloc_rdma_qvector(cdev, &rf->msix_entries[i])) + break; + + if (i < IRDMA_MIN_MSIX) { + while (--i >= 0) + ice_free_rdma_qvector(cdev, &rf->msix_entries[i]); + + kfree(rf->msix_entries); + return -ENOMEM; + } + + rf->msix_count = i; + + return 0; +} + +static void icrdma_deinit_interrupts(struct 
irdma_pci_f *rf, struct iidc_rdma_core_dev_info *cdev) +{ + int i; + + for (i = 0; i < rf->msix_count; i++) + ice_free_rdma_qvector(cdev, &rf->msix_entries[i]); + + kfree(rf->msix_entries); +} + +static void icrdma_fill_device_info(struct irdma_device *iwdev, + struct iidc_rdma_core_dev_info *cdev_info) +{ + struct iidc_rdma_priv_dev_info *idc_priv = cdev_info->iidc_priv; + struct irdma_pci_f *rf = iwdev->rf; + + rf->sc_dev.hw = &rf->hw; + rf->iwdev = iwdev; + rf->cdev = cdev_info; + rf->hw.hw_addr = idc_priv->hw_addr; + rf->pcidev = cdev_info->pdev; + rf->hw.device = &rf->pcidev->dev; + rf->pf_id = idc_priv->pf_id; + rf->rdma_ver = IRDMA_GEN_2; + rf->sc_dev.hw_attrs.uk_attrs.hw_rev = IRDMA_GEN_2; + rf->sc_dev.is_pf = true; + rf->sc_dev.privileged = true; + + rf->gen_ops.register_qset = icrdma_lan_register_qset; + rf->gen_ops.unregister_qset = icrdma_lan_unregister_qset; + + rf->default_vsi.vsi_idx = idc_priv->vport_id; + rf->protocol_used = + cdev_info->rdma_protocol == IIDC_RDMA_PROTOCOL_ROCEV2 ? + IRDMA_ROCE_PROTOCOL_ONLY : IRDMA_IWARP_PROTOCOL_ONLY; + rf->rsrc_profile = IRDMA_HMC_PROFILE_DEFAULT; + rf->rst_to = IRDMA_RST_TIMEOUT_HZ; + rf->gen_ops.request_reset = icrdma_request_reset; + rf->limits_sel = 7; + mutex_init(&rf->ah_tbl_lock); + + iwdev->netdev = idc_priv->netdev; + iwdev->vsi_num = idc_priv->vport_id; + iwdev->init_state = INITIAL_STATE; + iwdev->roce_cwnd = IRDMA_ROCE_CWND_DEFAULT; + iwdev->roce_ackcreds = IRDMA_ROCE_ACKCREDS_DEFAULT; + iwdev->rcv_wnd = IRDMA_CM_DEFAULT_RCV_WND_SCALED; + iwdev->rcv_wscale = IRDMA_CM_DEFAULT_RCV_WND_SCALE; + if (iwdev->rf->protocol_used != IRDMA_IWARP_PROTOCOL_ONLY) + iwdev->roce_mode = true; +} + +static int icrdma_probe(struct auxiliary_device *aux_dev, const struct auxiliary_device_id *id) +{ + struct iidc_rdma_core_auxiliary_dev *iidc_adev; + struct iidc_rdma_core_dev_info *cdev_info; + struct iidc_rdma_priv_dev_info *idc_priv; + struct irdma_l2params l2params = {}; + struct irdma_device *iwdev; + struct irdma_pci_f *rf; + int err; + + iidc_adev = container_of(aux_dev, struct iidc_rdma_core_auxiliary_dev, adev); + cdev_info = iidc_adev->cdev_info; + idc_priv = cdev_info->iidc_priv; + + iwdev = ib_alloc_device(irdma_device, ibdev); + if (!iwdev) + return -ENOMEM; + iwdev->rf = kzalloc(sizeof(*rf), GFP_KERNEL); + if (!iwdev->rf) { + ib_dealloc_device(&iwdev->ibdev); + return -ENOMEM; + } + + icrdma_fill_device_info(iwdev, cdev_info); + rf = iwdev->rf; + + err = icrdma_init_interrupts(rf, cdev_info); + if (err) + goto err_init_interrupts; + + err = irdma_ctrl_init_hw(rf); + if (err) + goto err_ctrl_init; + + l2params.mtu = iwdev->netdev->mtu; + icrdma_fill_qos_info(&l2params, &idc_priv->qos_info); + if (iwdev->rf->protocol_used != IRDMA_IWARP_PROTOCOL_ONLY) + iwdev->dcb_vlan_mode = l2params.num_tc > 1 && !l2params.dscp_mode; + + err = irdma_rt_init_hw(iwdev, &l2params); + if (err) + goto err_rt_init; + + err = irdma_ib_register_device(iwdev); + if (err) + goto err_ibreg; + + ice_rdma_update_vsi_filter(cdev_info, iwdev->vsi_num, true); + + ibdev_dbg(&iwdev->ibdev, "INIT: Gen2 PF[%d] device probe success\n", PCI_FUNC(rf->pcidev->devfn)); + auxiliary_set_drvdata(aux_dev, iwdev); + + return 0; + +err_ibreg: + irdma_rt_deinit_hw(iwdev); +err_rt_init: + irdma_ctrl_deinit_hw(rf); +err_ctrl_init: + icrdma_deinit_interrupts(rf, cdev_info); +err_init_interrupts: + kfree(iwdev->rf); + ib_dealloc_device(&iwdev->ibdev); + + return err; +} + +static void icrdma_remove(struct auxiliary_device *aux_dev) +{ + struct iidc_rdma_core_auxiliary_dev *idc_adev = 
+ container_of(aux_dev, struct iidc_rdma_core_auxiliary_dev, adev); + struct iidc_rdma_core_dev_info *cdev_info = idc_adev->cdev_info; + struct irdma_device *iwdev = auxiliary_get_drvdata(aux_dev); + u8 rdma_ver = iwdev->rf->rdma_ver; + + ice_rdma_update_vsi_filter(cdev_info, iwdev->vsi_num, false); + irdma_ib_unregister_device(iwdev); + icrdma_deinit_interrupts(iwdev->rf, cdev_info); + + pr_debug("INIT: Gen[%d] func[%d] device remove success\n", + rdma_ver, PCI_FUNC(cdev_info->pdev->devfn)); +} + +static const struct auxiliary_device_id icrdma_auxiliary_id_table[] = { + {.name = "ice.iwarp", }, + {.name = "ice.roce", }, + {}, +}; + +MODULE_DEVICE_TABLE(auxiliary, icrdma_auxiliary_id_table); + +struct iidc_rdma_core_auxiliary_drv icrdma_core_auxiliary_drv = { + .adrv = { + .name = "gen_2", + .id_table = icrdma_auxiliary_id_table, + .probe = icrdma_probe, + .remove = icrdma_remove, + }, + .event_handler = icrdma_iidc_event_handler, +}; diff --git a/drivers/infiniband/hw/irdma/ig3rdma_hw.c b/drivers/infiniband/hw/irdma/ig3rdma_hw.c new file mode 100644 index 000000000000..2e8bb475e22a --- /dev/null +++ b/drivers/infiniband/hw/irdma/ig3rdma_hw.c @@ -0,0 +1,170 @@ +// SPDX-License-Identifier: GPL-2.0 or Linux-OpenIB +/* Copyright (c) 2018 - 2024 Intel Corporation */ +#include "osdep.h" +#include "type.h" +#include "protos.h" +#include "ig3rdma_hw.h" + +/** + * ig3rdma_ena_irq - Enable interrupt + * @dev: pointer to the device structure + * @idx: vector index + */ +static void ig3rdma_ena_irq(struct irdma_sc_dev *dev, u32 idx) +{ + u32 val; + u32 int_stride = 1; /* one u32 per register */ + + if (dev->is_pf) + int_stride = 0x400; + else + idx--; /* VFs use DYN_CTL_N */ + + val = FIELD_PREP(IRDMA_GLINT_DYN_CTL_INTENA, 1) | + FIELD_PREP(IRDMA_GLINT_DYN_CTL_CLEARPBA, 1); + + writel(val, dev->hw_regs[IRDMA_GLINT_DYN_CTL] + (idx * int_stride)); +} + +/** + * ig3rdma_disable_irq - Disable interrupt + * @dev: pointer to the device structure + * @idx: vector index + */ +static void ig3rdma_disable_irq(struct irdma_sc_dev *dev, u32 idx) +{ + u32 int_stride = 1; /* one u32 per register */ + + if (dev->is_pf) + int_stride = 0x400; + else + idx--; /* VFs use DYN_CTL_N */ + + writel(0, dev->hw_regs[IRDMA_GLINT_DYN_CTL] + (idx * int_stride)); +} + +static const struct irdma_irq_ops ig3rdma_irq_ops = { + .irdma_dis_irq = ig3rdma_disable_irq, + .irdma_en_irq = ig3rdma_ena_irq, +}; + +static const struct irdma_hw_stat_map ig3rdma_hw_stat_map[] = { + [IRDMA_HW_STAT_INDEX_RXVLANERR] = { 0, 0, 0 }, + [IRDMA_HW_STAT_INDEX_IP4RXOCTS] = { 8, 0, 0 }, + [IRDMA_HW_STAT_INDEX_IP4RXPKTS] = { 16, 0, 0 }, + [IRDMA_HW_STAT_INDEX_IP4RXDISCARD] = { 24, 0, 0 }, + [IRDMA_HW_STAT_INDEX_IP4RXTRUNC] = { 32, 0, 0 }, + [IRDMA_HW_STAT_INDEX_IP4RXFRAGS] = { 40, 0, 0 }, + [IRDMA_HW_STAT_INDEX_IP4RXMCOCTS] = { 48, 0, 0 }, + [IRDMA_HW_STAT_INDEX_IP4RXMCPKTS] = { 56, 0, 0 }, + [IRDMA_HW_STAT_INDEX_IP6RXOCTS] = { 64, 0, 0 }, + [IRDMA_HW_STAT_INDEX_IP6RXPKTS] = { 72, 0, 0 }, + [IRDMA_HW_STAT_INDEX_IP6RXDISCARD] = { 80, 0, 0 }, + [IRDMA_HW_STAT_INDEX_IP6RXTRUNC] = { 88, 0, 0 }, + [IRDMA_HW_STAT_INDEX_IP6RXFRAGS] = { 96, 0, 0 }, + [IRDMA_HW_STAT_INDEX_IP6RXMCOCTS] = { 104, 0, 0 }, + [IRDMA_HW_STAT_INDEX_IP6RXMCPKTS] = { 112, 0, 0 }, + [IRDMA_HW_STAT_INDEX_IP4TXOCTS] = { 120, 0, 0 }, + [IRDMA_HW_STAT_INDEX_IP4TXPKTS] = { 128, 0, 0 }, + [IRDMA_HW_STAT_INDEX_IP4TXFRAGS] = { 136, 0, 0 }, + [IRDMA_HW_STAT_INDEX_IP4TXMCOCTS] = { 144, 0, 0 }, + [IRDMA_HW_STAT_INDEX_IP4TXMCPKTS] = { 152, 0, 0 }, + [IRDMA_HW_STAT_INDEX_IP6TXOCTS] = { 160, 0, 0 }, + 
[IRDMA_HW_STAT_INDEX_IP6TXPKTS] = { 168, 0, 0 }, + [IRDMA_HW_STAT_INDEX_IP6TXFRAGS] = { 176, 0, 0 }, + [IRDMA_HW_STAT_INDEX_IP6TXMCOCTS] = { 184, 0, 0 }, + [IRDMA_HW_STAT_INDEX_IP6TXMCPKTS] = { 192, 0, 0 }, + [IRDMA_HW_STAT_INDEX_IP4TXNOROUTE] = { 200, 0, 0 }, + [IRDMA_HW_STAT_INDEX_IP6TXNOROUTE] = { 208, 0, 0 }, + [IRDMA_HW_STAT_INDEX_TCPRTXSEG] = { 216, 0, 0 }, + [IRDMA_HW_STAT_INDEX_TCPRXOPTERR] = { 224, 0, 0 }, + [IRDMA_HW_STAT_INDEX_TCPRXPROTOERR] = { 232, 0, 0 }, + [IRDMA_HW_STAT_INDEX_TCPTXSEG] = { 240, 0, 0 }, + [IRDMA_HW_STAT_INDEX_TCPRXSEGS] = { 248, 0, 0 }, + [IRDMA_HW_STAT_INDEX_UDPRXPKTS] = { 256, 0, 0 }, + [IRDMA_HW_STAT_INDEX_UDPTXPKTS] = { 264, 0, 0 }, + [IRDMA_HW_STAT_INDEX_RDMARXWRS] = { 272, 0, 0 }, + [IRDMA_HW_STAT_INDEX_RDMARXRDS] = { 280, 0, 0 }, + [IRDMA_HW_STAT_INDEX_RDMARXSNDS] = { 288, 0, 0 }, + [IRDMA_HW_STAT_INDEX_RDMATXWRS] = { 296, 0, 0 }, + [IRDMA_HW_STAT_INDEX_RDMATXRDS] = { 304, 0, 0 }, + [IRDMA_HW_STAT_INDEX_RDMATXSNDS] = { 312, 0, 0 }, + [IRDMA_HW_STAT_INDEX_RDMAVBND] = { 320, 0, 0 }, + [IRDMA_HW_STAT_INDEX_RDMAVINV] = { 328, 0, 0 }, + [IRDMA_HW_STAT_INDEX_RXNPECNMARKEDPKTS] = { 336, 0, 0 }, + [IRDMA_HW_STAT_INDEX_RXRPCNPHANDLED] = { 344, 0, 0 }, + [IRDMA_HW_STAT_INDEX_RXRPCNPIGNORED] = { 352, 0, 0 }, + [IRDMA_HW_STAT_INDEX_TXNPCNPSENT] = { 360, 0, 0 }, + [IRDMA_HW_STAT_INDEX_RNR_SENT] = { 368, 0, 0 }, + [IRDMA_HW_STAT_INDEX_RNR_RCVD] = { 376, 0, 0 }, + [IRDMA_HW_STAT_INDEX_RDMAORDLMTCNT] = { 384, 0, 0 }, + [IRDMA_HW_STAT_INDEX_RDMAIRDLMTCNT] = { 392, 0, 0 }, + [IRDMA_HW_STAT_INDEX_RDMARXATS] = { 408, 0, 0 }, + [IRDMA_HW_STAT_INDEX_RDMATXATS] = { 416, 0, 0 }, + [IRDMA_HW_STAT_INDEX_NAKSEQERR] = { 424, 0, 0 }, + [IRDMA_HW_STAT_INDEX_NAKSEQERR_IMPLIED] = { 432, 0, 0 }, + [IRDMA_HW_STAT_INDEX_RTO] = { 440, 0, 0 }, + [IRDMA_HW_STAT_INDEX_RXOOOPKTS] = { 448, 0, 0 }, + [IRDMA_HW_STAT_INDEX_ICRCERR] = { 456, 0, 0 }, +}; + +void ig3rdma_init_hw(struct irdma_sc_dev *dev) +{ + dev->irq_ops = &ig3rdma_irq_ops; + dev->hw_stats_map = ig3rdma_hw_stat_map; + + dev->hw_attrs.uk_attrs.hw_rev = IRDMA_GEN_3; + dev->hw_attrs.uk_attrs.max_hw_wq_frags = IG3RDMA_MAX_WQ_FRAGMENT_COUNT; + dev->hw_attrs.uk_attrs.max_hw_read_sges = IG3RDMA_MAX_SGE_RD; + dev->hw_attrs.uk_attrs.max_hw_sq_chunk = IRDMA_MAX_QUANTA_PER_WR; + dev->hw_attrs.first_hw_vf_fpm_id = 0; + dev->hw_attrs.max_hw_vf_fpm_id = IG3_MAX_APFS + IG3_MAX_AVFS; + dev->hw_attrs.uk_attrs.feature_flags |= IRDMA_FEATURE_64_BYTE_CQE; + dev->hw_attrs.uk_attrs.feature_flags |= IRDMA_FEATURE_CQE_TIMESTAMPING; + + dev->hw_attrs.uk_attrs.feature_flags |= IRDMA_FEATURE_SRQ; + dev->hw_attrs.uk_attrs.feature_flags |= IRDMA_FEATURE_RTS_AE | + IRDMA_FEATURE_CQ_RESIZE; + dev->hw_attrs.page_size_cap = SZ_4K | SZ_2M | SZ_1G; + dev->hw_attrs.max_hw_ird = IG3RDMA_MAX_IRD_SIZE; + dev->hw_attrs.max_hw_ord = IG3RDMA_MAX_ORD_SIZE; + dev->hw_attrs.max_stat_inst = IG3RDMA_MAX_STATS_COUNT; + dev->hw_attrs.max_stat_idx = IRDMA_HW_STAT_INDEX_MAX_GEN_3; + dev->hw_attrs.uk_attrs.min_hw_wq_size = IG3RDMA_MIN_WQ_SIZE; + dev->hw_attrs.uk_attrs.max_hw_srq_quanta = IRDMA_SRQ_MAX_QUANTA; + dev->hw_attrs.uk_attrs.max_hw_inline = IG3RDMA_MAX_INLINE_DATA_SIZE; + dev->hw_attrs.max_hw_device_pages = + dev->is_pf ? 
IG3RDMA_MAX_PF_PUSH_PAGE_COUNT : IG3RDMA_MAX_VF_PUSH_PAGE_COUNT; +} + +static void __iomem *__ig3rdma_get_reg_addr(struct irdma_mmio_region *region, u64 reg_offset) +{ + if (reg_offset >= region->offset && + reg_offset < (region->offset + region->len)) { + reg_offset -= region->offset; + + return region->addr + reg_offset; + } + + return NULL; +} + +void __iomem *ig3rdma_get_reg_addr(struct irdma_hw *hw, u64 reg_offset) +{ + u8 __iomem *reg_addr; + int i; + + reg_addr = __ig3rdma_get_reg_addr(&hw->rdma_reg, reg_offset); + if (reg_addr) + return reg_addr; + + for (i = 0; i < hw->num_io_regions; i++) { + reg_addr = __ig3rdma_get_reg_addr(&hw->io_regs[i], reg_offset); + if (reg_addr) + return reg_addr; + } + + WARN_ON_ONCE(1); + + return NULL; +} diff --git a/drivers/infiniband/hw/irdma/ig3rdma_hw.h b/drivers/infiniband/hw/irdma/ig3rdma_hw.h new file mode 100644 index 000000000000..03d5f1188789 --- /dev/null +++ b/drivers/infiniband/hw/irdma/ig3rdma_hw.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 or Linux-OpenIB */ +/* Copyright (c) 2021 - 2024 Intel Corporation */ +#ifndef IG3RDMA_HW_H +#define IG3RDMA_HW_H + +#define IG3_MAX_APFS 1 +#define IG3_MAX_AVFS 0 + +#define IG3_PF_RDMA_REGION_OFFSET 0xBC00000 +#define IG3_PF_RDMA_REGION_LEN 0x401000 +#define IG3_VF_RDMA_REGION_OFFSET 0x8C00 +#define IG3_VF_RDMA_REGION_LEN 0x8400 + +enum ig3rdma_device_caps_const { + IG3RDMA_MAX_WQ_FRAGMENT_COUNT = 14, + IG3RDMA_MAX_SGE_RD = 14, + + IG3RDMA_MAX_STATS_COUNT = 128, + + IG3RDMA_MAX_IRD_SIZE = 64, + IG3RDMA_MAX_ORD_SIZE = 64, + IG3RDMA_MIN_WQ_SIZE = 16 /* WQEs */, + IG3RDMA_MAX_INLINE_DATA_SIZE = 216, + IG3RDMA_MAX_PF_PUSH_PAGE_COUNT = 8192, + IG3RDMA_MAX_VF_PUSH_PAGE_COUNT = 16, +}; + +void __iomem *ig3rdma_get_reg_addr(struct irdma_hw *hw, u64 reg_offset); +int ig3rdma_vchnl_send_sync(struct irdma_sc_dev *dev, u8 *msg, u16 len, + u8 *recv_msg, u16 *recv_len); + +#endif /* IG3RDMA_HW_H*/ diff --git a/drivers/infiniband/hw/irdma/ig3rdma_if.c b/drivers/infiniband/hw/irdma/ig3rdma_if.c new file mode 100644 index 000000000000..1bb42eb298ba --- /dev/null +++ b/drivers/infiniband/hw/irdma/ig3rdma_if.c @@ -0,0 +1,232 @@ +// SPDX-License-Identifier: GPL-2.0 or Linux-OpenIB +/* Copyright (c) 2023 - 2024 Intel Corporation */ + +#include "main.h" +#include <linux/net/intel/iidc_rdma_idpf.h> +#include "ig3rdma_hw.h" + +static void ig3rdma_idc_core_event_handler(struct iidc_rdma_core_dev_info *cdev_info, + struct iidc_rdma_event *event) +{ + struct irdma_pci_f *rf = auxiliary_get_drvdata(cdev_info->adev); + + if (*event->type & BIT(IIDC_RDMA_EVENT_WARN_RESET)) { + rf->reset = true; + rf->sc_dev.vchnl_up = false; + } +} + +int ig3rdma_vchnl_send_sync(struct irdma_sc_dev *dev, u8 *msg, u16 len, + u8 *recv_msg, u16 *recv_len) +{ + struct iidc_rdma_core_dev_info *cdev_info = dev_to_rf(dev)->cdev; + int ret; + + ret = idpf_idc_rdma_vc_send_sync(cdev_info, msg, len, recv_msg, + recv_len); + if (ret == -ETIMEDOUT) { + ibdev_err(&(dev_to_rf(dev)->iwdev->ibdev), + "Virtual channel Req <-> Resp completion timeout\n"); + dev->vchnl_up = false; + } + + return ret; +} + +static int ig3rdma_vchnl_init(struct irdma_pci_f *rf, + struct iidc_rdma_core_dev_info *cdev_info, + u8 *rdma_ver) +{ + struct iidc_rdma_priv_dev_info *idc_priv = cdev_info->iidc_priv; + struct irdma_vchnl_init_info virt_info; + u8 gen = rf->rdma_ver; + int ret; + + rf->vchnl_wq = alloc_ordered_workqueue("irdma-virtchnl-wq", 0); + if (!rf->vchnl_wq) + return -ENOMEM; + + mutex_init(&rf->sc_dev.vchnl_mutex); + + virt_info.is_pf = !idc_priv->ftype; + 
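/*
 * A minimal sketch of the region-lookup scheme in ig3rdma_get_reg_addr()
 * above: each mapped window carries a device offset and a length, and a
 * register offset resolves against the first window that contains it.
 * Plain pointers stand in for __iomem mappings; all names here are
 * illustrative, not irdma symbols.
 */
#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

struct mmio_region {
	uint8_t *addr;		/* mapped base of the window */
	uint64_t offset;	/* device offset the window starts at */
	uint64_t len;
};

static void *region_lookup(const struct mmio_region *r, size_t n,
			   uint64_t off)
{
	size_t i;

	for (i = 0; i < n; i++)
		if (off >= r[i].offset && off < r[i].offset + r[i].len)
			return r[i].addr + (off - r[i].offset);
	return NULL;	/* offset not covered by any mapped window */
}

int main(void)
{
	static uint8_t bar_lo[0x1000], bar_hi[0x1000];
	struct mmio_region regs[] = {
		{ bar_lo, 0x8000, sizeof(bar_lo) },
		{ bar_hi, 0xBC00000, sizeof(bar_hi) },
	};

	printf("%p\n", region_lookup(regs, 2, 0xBC00010));
	return 0;
}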
virt_info.hw_rev = gen; + virt_info.privileged = gen == IRDMA_GEN_2; + virt_info.vchnl_wq = rf->vchnl_wq; + ret = irdma_sc_vchnl_init(&rf->sc_dev, &virt_info); + if (ret) { + destroy_workqueue(rf->vchnl_wq); + return ret; + } + + *rdma_ver = rf->sc_dev.hw_attrs.uk_attrs.hw_rev; + + return 0; +} + +/** + * ig3rdma_request_reset - Request a reset + * @rf: RDMA PCI function + */ +static void ig3rdma_request_reset(struct irdma_pci_f *rf) +{ + ibdev_warn(&rf->iwdev->ibdev, "Requesting a reset\n"); + idpf_idc_request_reset(rf->cdev, IIDC_FUNC_RESET); +} + +static int ig3rdma_cfg_regions(struct irdma_hw *hw, + struct iidc_rdma_core_dev_info *cdev_info) +{ + struct iidc_rdma_priv_dev_info *idc_priv = cdev_info->iidc_priv; + struct pci_dev *pdev = cdev_info->pdev; + int i; + + switch (idc_priv->ftype) { + case IIDC_FUNCTION_TYPE_PF: + hw->rdma_reg.len = IG3_PF_RDMA_REGION_LEN; + hw->rdma_reg.offset = IG3_PF_RDMA_REGION_OFFSET; + break; + case IIDC_FUNCTION_TYPE_VF: + hw->rdma_reg.len = IG3_VF_RDMA_REGION_LEN; + hw->rdma_reg.offset = IG3_VF_RDMA_REGION_OFFSET; + break; + default: + return -ENODEV; + } + + hw->rdma_reg.addr = ioremap(pci_resource_start(pdev, 0) + hw->rdma_reg.offset, + hw->rdma_reg.len); + + if (!hw->rdma_reg.addr) + return -ENOMEM; + + hw->num_io_regions = le16_to_cpu(idc_priv->num_memory_regions); + hw->io_regs = kcalloc(hw->num_io_regions, + sizeof(struct irdma_mmio_region), GFP_KERNEL); + + if (!hw->io_regs) { + iounmap(hw->rdma_reg.addr); + return -ENOMEM; + } + + for (i = 0; i < hw->num_io_regions; i++) { + hw->io_regs[i].addr = + idc_priv->mapped_mem_regions[i].region_addr; + hw->io_regs[i].len = + le64_to_cpu(idc_priv->mapped_mem_regions[i].size); + hw->io_regs[i].offset = + le64_to_cpu(idc_priv->mapped_mem_regions[i].start_offset); + } + + return 0; +} + +static void ig3rdma_decfg_rf(struct irdma_pci_f *rf) +{ + struct irdma_hw *hw = &rf->hw; + + destroy_workqueue(rf->vchnl_wq); + kfree(hw->io_regs); + iounmap(hw->rdma_reg.addr); +} + +static int ig3rdma_cfg_rf(struct irdma_pci_f *rf, + struct iidc_rdma_core_dev_info *cdev_info) +{ + struct iidc_rdma_priv_dev_info *idc_priv = cdev_info->iidc_priv; + int err; + + rf->sc_dev.hw = &rf->hw; + rf->cdev = cdev_info; + rf->pcidev = cdev_info->pdev; + rf->hw.device = &rf->pcidev->dev; + rf->msix_count = idc_priv->msix_count; + rf->msix_entries = idc_priv->msix_entries; + + err = ig3rdma_vchnl_init(rf, cdev_info, &rf->rdma_ver); + if (err) + return err; + + err = ig3rdma_cfg_regions(&rf->hw, cdev_info); + if (err) { + destroy_workqueue(rf->vchnl_wq); + return err; + } + + rf->protocol_used = IRDMA_ROCE_PROTOCOL_ONLY; + rf->rsrc_profile = IRDMA_HMC_PROFILE_DEFAULT; + rf->rst_to = IRDMA_RST_TIMEOUT_HZ; + rf->gen_ops.request_reset = ig3rdma_request_reset; + rf->limits_sel = 7; + mutex_init(&rf->ah_tbl_lock); + + return 0; +} + +static int ig3rdma_core_probe(struct auxiliary_device *aux_dev, + const struct auxiliary_device_id *id) +{ + struct iidc_rdma_core_auxiliary_dev *idc_adev = + container_of(aux_dev, struct iidc_rdma_core_auxiliary_dev, adev); + struct iidc_rdma_core_dev_info *cdev_info = idc_adev->cdev_info; + struct irdma_pci_f *rf; + int err; + + rf = kzalloc(sizeof(*rf), GFP_KERNEL); + if (!rf) + return -ENOMEM; + + err = ig3rdma_cfg_rf(rf, cdev_info); + if (err) + goto err_cfg_rf; + + err = irdma_ctrl_init_hw(rf); + if (err) + goto err_ctrl_init; + + auxiliary_set_drvdata(aux_dev, rf); + + err = idpf_idc_vport_dev_ctrl(cdev_info, true); + if (err) + goto err_vport_ctrl; + + return 0; + +err_vport_ctrl: + 
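/*
 * A minimal sketch (illustrative names, not irdma symbols) of the staged
 * probe/unwind idiom used by ig3rdma_core_probe() above: each init step
 * gets a matching error label, and a failure jumps to the label that
 * tears down only the steps that already succeeded, in reverse order.
 */
#include <stdio.h>

static int step_a(void) { return 0; }
static int step_b(void) { return 0; }
static int step_c(void) { return -1; }	/* force the unwind path */
static void undo_a(void) { puts("undo a"); }
static void undo_b(void) { puts("undo b"); }

static int probe(void)
{
	int err;

	err = step_a();
	if (err)
		goto err_a;
	err = step_b();
	if (err)
		goto err_b;
	err = step_c();
	if (err)
		goto err_c;
	return 0;

err_c:
	undo_b();	/* reverse order: newest resource released first */
err_b:
	undo_a();
err_a:
	return err;
}

int main(void)
{
	return probe() ? 1 : 0;
}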
irdma_ctrl_deinit_hw(rf); +err_ctrl_init: + ig3rdma_decfg_rf(rf); +err_cfg_rf: + kfree(rf); + + return err; +} + +static void ig3rdma_core_remove(struct auxiliary_device *aux_dev) +{ + struct iidc_rdma_core_auxiliary_dev *idc_adev = + container_of(aux_dev, struct iidc_rdma_core_auxiliary_dev, adev); + struct iidc_rdma_core_dev_info *cdev_info = idc_adev->cdev_info; + struct irdma_pci_f *rf = auxiliary_get_drvdata(aux_dev); + + idpf_idc_vport_dev_ctrl(cdev_info, false); + irdma_ctrl_deinit_hw(rf); + ig3rdma_decfg_rf(rf); + kfree(rf); +} + +static const struct auxiliary_device_id ig3rdma_core_auxiliary_id_table[] = { + {.name = "idpf.8086.rdma.core", }, + {}, +}; + +MODULE_DEVICE_TABLE(auxiliary, ig3rdma_core_auxiliary_id_table); + +struct iidc_rdma_core_auxiliary_drv ig3rdma_core_auxiliary_drv = { + .adrv = { + .name = "core", + .id_table = ig3rdma_core_auxiliary_id_table, + .probe = ig3rdma_core_probe, + .remove = ig3rdma_core_remove, + }, + .event_handler = ig3rdma_idc_core_event_handler, +}; diff --git a/drivers/infiniband/hw/irdma/irdma.h b/drivers/infiniband/hw/irdma/irdma.h index 20d2e7393e3d..ff938a01d70c 100644 --- a/drivers/infiniband/hw/irdma/irdma.h +++ b/drivers/infiniband/hw/irdma/irdma.h @@ -32,7 +32,16 @@ #define IRDMA_PFHMC_SDDATALOW_PMSDDATALOW GENMASK(31, 12) #define IRDMA_PFHMC_SDCMD_PMSDWR BIT(31) -#define IRDMA_INVALID_CQ_IDX 0xffffffff +#define IRDMA_INVALID_CQ_IDX 0xffffffff +#define IRDMA_Q_INVALID_IDX 0xffff + +enum irdma_dyn_idx_t { + IRDMA_IDX_ITR0 = 0, + IRDMA_IDX_ITR1 = 1, + IRDMA_IDX_ITR2 = 2, + IRDMA_IDX_NOITR = 3, +}; + enum irdma_registers { IRDMA_CQPTAIL, IRDMA_CQPDB, @@ -67,6 +76,7 @@ enum irdma_shifts { IRDMA_CQPSQ_CQ_CEQID_S, IRDMA_CQPSQ_CQ_CQID_S, IRDMA_COMMIT_FPM_CQCNT_S, + IRDMA_CQPSQ_UPESD_HMCFNID_S, IRDMA_MAX_SHIFTS, }; @@ -77,6 +87,7 @@ enum irdma_masks { IRDMA_CQPSQ_CQ_CEQID_M, IRDMA_CQPSQ_CQ_CQID_M, IRDMA_COMMIT_FPM_CQCNT_M, + IRDMA_CQPSQ_UPESD_HMCFNID_M, IRDMA_MAX_MASKS, /* Must be last entry */ }; @@ -92,7 +103,7 @@ struct irdma_mcast_grp_ctx_entry_info { struct irdma_mcast_grp_info { u8 dest_mac_addr[ETH_ALEN]; u16 vlan_id; - u8 hmc_fcn_id; + u16 hmc_fcn_id; bool ipv4_valid:1; bool vlan_valid:1; u16 mg_id; @@ -107,6 +118,9 @@ enum irdma_vers { IRDMA_GEN_RSVD, IRDMA_GEN_1, IRDMA_GEN_2, + IRDMA_GEN_3, + IRDMA_GEN_NEXT, + IRDMA_GEN_MAX = IRDMA_GEN_NEXT-1 }; struct irdma_uk_attrs { @@ -118,6 +132,7 @@ struct irdma_uk_attrs { u32 max_hw_wq_quanta; u32 min_hw_cq_size; u32 max_hw_cq_size; + u32 max_hw_srq_quanta; u16 max_hw_sq_chunk; u16 min_hw_wq_size; u8 hw_rev; @@ -147,10 +162,13 @@ struct irdma_hw_attrs { u32 max_done_count; u32 max_sleep_count; u32 max_cqp_compl_wait_time_ms; + u32 min_hw_srq_id; u16 max_stat_inst; u16 max_stat_idx; }; void i40iw_init_hw(struct irdma_sc_dev *dev); void icrdma_init_hw(struct irdma_sc_dev *dev); +void ig3rdma_init_hw(struct irdma_sc_dev *dev); +void __iomem *ig3rdma_get_reg_addr(struct irdma_hw *hw, u64 reg_offset); #endif /* IRDMA_H*/ diff --git a/drivers/infiniband/hw/irdma/main.c b/drivers/infiniband/hw/irdma/main.c index 1e840bbd619d..95957d52883d 100644 --- a/drivers/infiniband/hw/irdma/main.c +++ b/drivers/infiniband/hw/irdma/main.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* Copyright (c) 2015 - 2021 Intel Corporation */ #include "main.h" +#include <linux/net/intel/iidc_rdma_idpf.h> MODULE_ALIAS("i40iw"); MODULE_DESCRIPTION("Intel(R) Ethernet Protocol Driver for RDMA"); @@ -38,19 +39,7 @@ static void irdma_unregister_notifiers(void) 
unregister_netdevice_notifier(&irdma_netdevice_notifier); } -static void irdma_prep_tc_change(struct irdma_device *iwdev) -{ - iwdev->vsi.tc_change_pending = true; - irdma_sc_suspend_resume_qps(&iwdev->vsi, IRDMA_OP_SUSPEND); - - /* Wait for all qp's to suspend */ - wait_event_timeout(iwdev->suspend_wq, - !atomic_read(&iwdev->vsi.qp_suspend_reqs), - msecs_to_jiffies(IRDMA_EVENT_TIMEOUT_MS)); - irdma_ws_reset(&iwdev->vsi); -} - -static void irdma_log_invalid_mtu(u16 mtu, struct irdma_sc_dev *dev) +void irdma_log_invalid_mtu(u16 mtu, struct irdma_sc_dev *dev) { if (mtu < IRDMA_MIN_MTU_IPV4) ibdev_warn(to_ibdev(dev), "MTU setting [%d] too low for RDMA traffic. Minimum MTU is 576 for IPv4\n", mtu); @@ -58,35 +47,10 @@ static void irdma_unregister_notifiers(void) ibdev_warn(to_ibdev(dev), "MTU setting [%d] too low for RDMA traffic. Minimum MTU is 1280 for IPv6\n", mtu); } -static void irdma_fill_qos_info(struct irdma_l2params *l2params, - struct iidc_rdma_qos_params *qos_info) +static void ig3rdma_idc_vport_event_handler(struct iidc_rdma_vport_dev_info *cdev_info, + struct iidc_rdma_event *event) { - int i; - - l2params->num_tc = qos_info->num_tc; - l2params->vsi_prio_type = qos_info->vport_priority_type; - l2params->vsi_rel_bw = qos_info->vport_relative_bw; - for (i = 0; i < l2params->num_tc; i++) { - l2params->tc_info[i].egress_virt_up = - qos_info->tc_info[i].egress_virt_up; - l2params->tc_info[i].ingress_virt_up = - qos_info->tc_info[i].ingress_virt_up; - l2params->tc_info[i].prio_type = qos_info->tc_info[i].prio_type; - l2params->tc_info[i].rel_bw = qos_info->tc_info[i].rel_bw; - l2params->tc_info[i].tc_ctx = qos_info->tc_info[i].tc_ctx; - } - for (i = 0; i < IIDC_MAX_USER_PRIORITY; i++) - l2params->up2tc[i] = qos_info->up2tc[i]; - if (qos_info->pfc_mode == IIDC_DSCP_PFC_MODE) { - l2params->dscp_mode = true; - memcpy(l2params->dscp_map, qos_info->dscp_map, sizeof(l2params->dscp_map)); - } -} - -static void irdma_iidc_event_handler(struct iidc_rdma_core_dev_info *cdev_info, - struct iidc_rdma_event *event) -{ - struct irdma_device *iwdev = dev_get_drvdata(&cdev_info->adev->dev); + struct irdma_device *iwdev = auxiliary_get_drvdata(cdev_info->adev); struct irdma_l2params l2params = {}; if (*event->type & BIT(IIDC_RDMA_EVENT_AFTER_MTU_CHANGE)) { @@ -97,248 +61,39 @@ static void irdma_iidc_event_handler(struct iidc_rdma_core_dev_info *cdev_info, irdma_log_invalid_mtu(l2params.mtu, &iwdev->rf->sc_dev); irdma_change_l2params(&iwdev->vsi, &l2params); } - } else if (*event->type & BIT(IIDC_RDMA_EVENT_BEFORE_TC_CHANGE)) { - if (iwdev->vsi.tc_change_pending) - return; - - irdma_prep_tc_change(iwdev); - } else if (*event->type & BIT(IIDC_RDMA_EVENT_AFTER_TC_CHANGE)) { - struct iidc_rdma_priv_dev_info *iidc_priv = cdev_info->iidc_priv; - - if (!iwdev->vsi.tc_change_pending) - return; - - l2params.tc_changed = true; - ibdev_dbg(&iwdev->ibdev, "CLNT: TC Change\n"); - - irdma_fill_qos_info(&l2params, &iidc_priv->qos_info); - if (iwdev->rf->protocol_used != IRDMA_IWARP_PROTOCOL_ONLY) - iwdev->dcb_vlan_mode = - l2params.num_tc > 1 && !l2params.dscp_mode; - irdma_change_l2params(&iwdev->vsi, &l2params); - } else if (*event->type & BIT(IIDC_RDMA_EVENT_CRIT_ERR)) { - ibdev_warn(&iwdev->ibdev, "ICE OICR event notification: oicr = 0x%08x\n", - event->reg); - if (event->reg & IRDMAPFINT_OICR_PE_CRITERR_M) { - u32 pe_criterr; - - pe_criterr = readl(iwdev->rf->sc_dev.hw_regs[IRDMA_GLPE_CRITERR]); -#define IRDMA_Q1_RESOURCE_ERR 0x0001024d - if (pe_criterr != IRDMA_Q1_RESOURCE_ERR) { -
ibdev_err(&iwdev->ibdev, "critical PE Error, GLPE_CRITERR=0x%08x\n", - pe_criterr); - iwdev->rf->reset = true; - } else { - ibdev_warn(&iwdev->ibdev, "Q1 Resource Check\n"); - } - } - if (event->reg & IRDMAPFINT_OICR_HMC_ERR_M) { - ibdev_err(&iwdev->ibdev, "HMC Error\n"); - iwdev->rf->reset = true; - } - if (event->reg & IRDMAPFINT_OICR_PE_PUSH_M) { - ibdev_err(&iwdev->ibdev, "PE Push Error\n"); - iwdev->rf->reset = true; - } - if (iwdev->rf->reset) - iwdev->rf->gen_ops.request_reset(iwdev->rf); } } -/** - * irdma_request_reset - Request a reset - * @rf: RDMA PCI function - */ -static void irdma_request_reset(struct irdma_pci_f *rf) -{ - ibdev_warn(&rf->iwdev->ibdev, "Requesting a reset\n"); - ice_rdma_request_reset(rf->cdev, IIDC_FUNC_RESET); -} - -/** - * irdma_lan_register_qset - Register qset with LAN driver - * @vsi: vsi structure - * @tc_node: Traffic class node - */ -static int irdma_lan_register_qset(struct irdma_sc_vsi *vsi, - struct irdma_ws_node *tc_node) -{ - struct irdma_device *iwdev = vsi->back_vsi; - struct iidc_rdma_core_dev_info *cdev_info; - struct iidc_rdma_qset_params qset = {}; - int ret; - - cdev_info = iwdev->rf->cdev; - qset.qs_handle = tc_node->qs_handle; - qset.tc = tc_node->traffic_class; - qset.vport_id = vsi->vsi_idx; - ret = ice_add_rdma_qset(cdev_info, &qset); - if (ret) { - ibdev_dbg(&iwdev->ibdev, "WS: LAN alloc_res for rdma qset failed.\n"); - return ret; - } - - tc_node->l2_sched_node_id = qset.teid; - vsi->qos[tc_node->user_pri].l2_sched_node_id = qset.teid; - - return 0; -} - -/** - * irdma_lan_unregister_qset - Unregister qset with LAN driver - * @vsi: vsi structure - * @tc_node: Traffic class node - */ -static void irdma_lan_unregister_qset(struct irdma_sc_vsi *vsi, - struct irdma_ws_node *tc_node) +static int ig3rdma_vport_probe(struct auxiliary_device *aux_dev, + const struct auxiliary_device_id *id) { - struct irdma_device *iwdev = vsi->back_vsi; - struct iidc_rdma_core_dev_info *cdev_info; - struct iidc_rdma_qset_params qset = {}; - - cdev_info = iwdev->rf->cdev; - qset.qs_handle = tc_node->qs_handle; - qset.tc = tc_node->traffic_class; - qset.vport_id = vsi->vsi_idx; - qset.teid = tc_node->l2_sched_node_id; - - if (ice_del_rdma_qset(cdev_info, &qset)) - ibdev_dbg(&iwdev->ibdev, "WS: LAN free_res for rdma qset failed.\n"); -} - -static int irdma_init_interrupts(struct irdma_pci_f *rf, struct iidc_rdma_core_dev_info *cdev) -{ - int i; - - rf->msix_count = num_online_cpus() + IRDMA_NUM_AEQ_MSIX; - rf->msix_entries = kcalloc(rf->msix_count, sizeof(*rf->msix_entries), - GFP_KERNEL); - if (!rf->msix_entries) - return -ENOMEM; - - for (i = 0; i < rf->msix_count; i++) - if (ice_alloc_rdma_qvector(cdev, &rf->msix_entries[i])) - break; - - if (i < IRDMA_MIN_MSIX) { - while (--i >= 0) - ice_free_rdma_qvector(cdev, &rf->msix_entries[i]); + struct iidc_rdma_vport_auxiliary_dev *idc_adev = + container_of(aux_dev, struct iidc_rdma_vport_auxiliary_dev, adev); + struct auxiliary_device *aux_core_dev = idc_adev->vdev_info->core_adev; + struct irdma_pci_f *rf = auxiliary_get_drvdata(aux_core_dev); + struct irdma_l2params l2params = {}; + struct irdma_device *iwdev; + int err; - kfree(rf->msix_entries); + if (!rf) { + WARN_ON_ONCE(1); return -ENOMEM; } - - rf->msix_count = i; - - return 0; -} - -static void irdma_deinit_interrupts(struct irdma_pci_f *rf, struct iidc_rdma_core_dev_info *cdev) -{ - int i; - - for (i = 0; i < rf->msix_count; i++) - ice_free_rdma_qvector(cdev, &rf->msix_entries[i]); - - kfree(rf->msix_entries); -} - -static void irdma_remove(struct 
auxiliary_device *aux_dev) -{ - struct irdma_device *iwdev = auxiliary_get_drvdata(aux_dev); - struct iidc_rdma_core_auxiliary_dev *iidc_adev; - struct iidc_rdma_core_dev_info *cdev_info; - - iidc_adev = container_of(aux_dev, struct iidc_rdma_core_auxiliary_dev, adev); - cdev_info = iidc_adev->cdev_info; - - ice_rdma_update_vsi_filter(cdev_info, iwdev->vsi_num, false); - irdma_ib_unregister_device(iwdev); - irdma_deinit_interrupts(iwdev->rf, cdev_info); - - kfree(iwdev->rf); - - pr_debug("INIT: Gen2 PF[%d] device remove success\n", PCI_FUNC(cdev_info->pdev->devfn)); -} - -static void irdma_fill_device_info(struct irdma_device *iwdev, - struct iidc_rdma_core_dev_info *cdev_info) -{ - struct iidc_rdma_priv_dev_info *iidc_priv = cdev_info->iidc_priv; - struct irdma_pci_f *rf = iwdev->rf; - - rf->sc_dev.hw = &rf->hw; - rf->iwdev = iwdev; - rf->cdev = cdev_info; - rf->hw.hw_addr = iidc_priv->hw_addr; - rf->pcidev = cdev_info->pdev; - rf->hw.device = &rf->pcidev->dev; - rf->pf_id = iidc_priv->pf_id; - rf->gen_ops.register_qset = irdma_lan_register_qset; - rf->gen_ops.unregister_qset = irdma_lan_unregister_qset; - - rf->default_vsi.vsi_idx = iidc_priv->vport_id; - rf->protocol_used = - cdev_info->rdma_protocol == IIDC_RDMA_PROTOCOL_ROCEV2 ? - IRDMA_ROCE_PROTOCOL_ONLY : IRDMA_IWARP_PROTOCOL_ONLY; - rf->rdma_ver = IRDMA_GEN_2; - rf->rsrc_profile = IRDMA_HMC_PROFILE_DEFAULT; - rf->rst_to = IRDMA_RST_TIMEOUT_HZ; - rf->gen_ops.request_reset = irdma_request_reset; - rf->limits_sel = 7; - rf->iwdev = iwdev; - - mutex_init(&iwdev->ah_tbl_lock); - - iwdev->netdev = iidc_priv->netdev; - iwdev->vsi_num = iidc_priv->vport_id; + iwdev = ib_alloc_device(irdma_device, ibdev); + /* Fill iwdev info */ + iwdev->is_vport = true; + iwdev->rf = rf; + iwdev->vport_id = idc_adev->vdev_info->vport_id; + iwdev->netdev = idc_adev->vdev_info->netdev; iwdev->init_state = INITIAL_STATE; iwdev->roce_cwnd = IRDMA_ROCE_CWND_DEFAULT; iwdev->roce_ackcreds = IRDMA_ROCE_ACKCREDS_DEFAULT; iwdev->rcv_wnd = IRDMA_CM_DEFAULT_RCV_WND_SCALED; iwdev->rcv_wscale = IRDMA_CM_DEFAULT_RCV_WND_SCALE; - if (rf->protocol_used == IRDMA_ROCE_PROTOCOL_ONLY) - iwdev->roce_mode = true; -} - -static int irdma_probe(struct auxiliary_device *aux_dev, const struct auxiliary_device_id *id) -{ - struct iidc_rdma_core_auxiliary_dev *iidc_adev; - struct iidc_rdma_core_dev_info *cdev_info; - struct iidc_rdma_priv_dev_info *iidc_priv; - struct irdma_l2params l2params = {}; - struct irdma_device *iwdev; - struct irdma_pci_f *rf; - int err; - - iidc_adev = container_of(aux_dev, struct iidc_rdma_core_auxiliary_dev, adev); - cdev_info = iidc_adev->cdev_info; - iidc_priv = cdev_info->iidc_priv; - - iwdev = ib_alloc_device(irdma_device, ibdev); - if (!iwdev) - return -ENOMEM; - iwdev->rf = kzalloc(sizeof(*rf), GFP_KERNEL); - if (!iwdev->rf) { - ib_dealloc_device(&iwdev->ibdev); - return -ENOMEM; - } - - irdma_fill_device_info(iwdev, cdev_info); - rf = iwdev->rf; - - err = irdma_init_interrupts(rf, cdev_info); - if (err) - goto err_init_interrupts; - - err = irdma_ctrl_init_hw(rf); - if (err) - goto err_ctrl_init; + iwdev->roce_mode = true; + iwdev->push_mode = false; l2params.mtu = iwdev->netdev->mtu; - irdma_fill_qos_info(&l2params, &iidc_priv->qos_info); - if (iwdev->rf->protocol_used != IRDMA_IWARP_PROTOCOL_ONLY) - iwdev->dcb_vlan_mode = l2params.num_tc > 1 && !l2params.dscp_mode; err = irdma_rt_init_hw(iwdev, &l2params); if (err) @@ -348,43 +103,57 @@ static int irdma_probe(struct auxiliary_device *aux_dev, const struct auxiliary_ if (err) goto err_ibreg; - 
ice_rdma_update_vsi_filter(cdev_info, iwdev->vsi_num, true); - - ibdev_dbg(&iwdev->ibdev, "INIT: Gen2 PF[%d] device probe success\n", PCI_FUNC(rf->pcidev->devfn)); auxiliary_set_drvdata(aux_dev, iwdev); - return 0; + ibdev_dbg(&iwdev->ibdev, + "INIT: Gen[%d] vport[%d] probe success. dev_name = %s, core_dev_name = %s, netdev=%s\n", + rf->rdma_ver, idc_adev->vdev_info->vport_id, + dev_name(&aux_dev->dev), + dev_name(&idc_adev->vdev_info->core_adev->dev), + netdev_name(idc_adev->vdev_info->netdev)); + return 0; err_ibreg: irdma_rt_deinit_hw(iwdev); err_rt_init: - irdma_ctrl_deinit_hw(rf); -err_ctrl_init: - irdma_deinit_interrupts(rf, cdev_info); -err_init_interrupts: - kfree(iwdev->rf); ib_dealloc_device(&iwdev->ibdev); return err; } -static const struct auxiliary_device_id irdma_auxiliary_id_table[] = { - {.name = "ice.iwarp", }, - {.name = "ice.roce", }, +static void ig3rdma_vport_remove(struct auxiliary_device *aux_dev) +{ + struct iidc_rdma_vport_auxiliary_dev *idc_adev = + container_of(aux_dev, struct iidc_rdma_vport_auxiliary_dev, adev); + struct irdma_device *iwdev = auxiliary_get_drvdata(aux_dev); + + ibdev_dbg(&iwdev->ibdev, + "INIT: Gen[%d] dev_name = %s, core_dev_name = %s, netdev=%s\n", + iwdev->rf->rdma_ver, dev_name(&aux_dev->dev), + dev_name(&idc_adev->vdev_info->core_adev->dev), + netdev_name(idc_adev->vdev_info->netdev)); + + irdma_ib_unregister_device(iwdev); +} + +static const struct auxiliary_device_id ig3rdma_vport_auxiliary_id_table[] = { + {.name = "idpf.8086.rdma.vdev", }, {}, }; -MODULE_DEVICE_TABLE(auxiliary, irdma_auxiliary_id_table); +MODULE_DEVICE_TABLE(auxiliary, ig3rdma_vport_auxiliary_id_table); -static struct iidc_rdma_core_auxiliary_drv irdma_auxiliary_drv = { +static struct iidc_rdma_vport_auxiliary_drv ig3rdma_vport_auxiliary_drv = { .adrv = { - .id_table = irdma_auxiliary_id_table, - .probe = irdma_probe, - .remove = irdma_remove, + .name = "vdev", + .id_table = ig3rdma_vport_auxiliary_id_table, + .probe = ig3rdma_vport_probe, + .remove = ig3rdma_vport_remove, }, - .event_handler = irdma_iidc_event_handler, + .event_handler = ig3rdma_idc_vport_event_handler, }; + static int __init irdma_init_module(void) { int ret; @@ -396,14 +165,34 @@ static int __init irdma_init_module(void) return ret; } - ret = auxiliary_driver_register(&irdma_auxiliary_drv.adrv); + ret = auxiliary_driver_register(&icrdma_core_auxiliary_drv.adrv); + if (ret) { + auxiliary_driver_unregister(&i40iw_auxiliary_drv); + pr_err("Failed icrdma(gen_2) auxiliary_driver_register() ret=%d\n", + ret); + return ret; + } + + ret = auxiliary_driver_register(&ig3rdma_core_auxiliary_drv.adrv); if (ret) { + auxiliary_driver_unregister(&icrdma_core_auxiliary_drv.adrv); auxiliary_driver_unregister(&i40iw_auxiliary_drv); - pr_err("Failed irdma auxiliary_driver_register() ret=%d\n", + pr_err("Failed ig3rdma(gen_3) core auxiliary_driver_register() ret=%d\n", ret); + return ret; } + ret = auxiliary_driver_register(&ig3rdma_vport_auxiliary_drv.adrv); + if (ret) { + auxiliary_driver_unregister(&ig3rdma_core_auxiliary_drv.adrv); + auxiliary_driver_unregister(&icrdma_core_auxiliary_drv.adrv); + auxiliary_driver_unregister(&i40iw_auxiliary_drv); + pr_err("Failed ig3rdma vport auxiliary_driver_register() ret=%d\n", + ret); + + return ret; + } irdma_register_notifiers(); return 0; @@ -412,8 +201,10 @@ static int __init irdma_init_module(void) static void __exit irdma_exit_module(void) { irdma_unregister_notifiers(); - auxiliary_driver_unregister(&irdma_auxiliary_drv.adrv); + 
auxiliary_driver_unregister(&icrdma_core_auxiliary_drv.adrv); auxiliary_driver_unregister(&i40iw_auxiliary_drv); + auxiliary_driver_unregister(&ig3rdma_core_auxiliary_drv.adrv); + auxiliary_driver_unregister(&ig3rdma_vport_auxiliary_drv.adrv); } module_init(irdma_init_module); diff --git a/drivers/infiniband/hw/irdma/main.h b/drivers/infiniband/hw/irdma/main.h index 674acc952168..886b30da188a 100644 --- a/drivers/infiniband/hw/irdma/main.h +++ b/drivers/infiniband/hw/irdma/main.h @@ -30,7 +30,6 @@ #endif #include <linux/auxiliary_bus.h> #include <linux/net/intel/iidc_rdma.h> -#include <linux/net/intel/iidc_rdma_ice.h> #include <rdma/ib_smi.h> #include <rdma/ib_verbs.h> #include <rdma/ib_pack.h> @@ -54,6 +53,8 @@ #include "puda.h" extern struct auxiliary_driver i40iw_auxiliary_drv; +extern struct iidc_rdma_core_auxiliary_drv icrdma_core_auxiliary_drv; +extern struct iidc_rdma_core_auxiliary_drv ig3rdma_core_auxiliary_drv; #define IRDMA_FW_VER_DEFAULT 2 #define IRDMA_HW_VER 2 @@ -65,7 +66,8 @@ extern struct auxiliary_driver i40iw_auxiliary_drv; #define IRDMA_MACIP_ADD 1 #define IRDMA_MACIP_DELETE 2 -#define IW_CCQ_SIZE (IRDMA_CQP_SW_SQSIZE_2048 + 1) +#define IW_GEN_3_CCQ_SIZE (2 * IRDMA_CQP_SW_SQSIZE_2048 + 2) +#define IW_CCQ_SIZE (IRDMA_CQP_SW_SQSIZE_2048 + 2) #define IW_CEQ_SIZE 2048 #define IW_AEQ_SIZE 2048 @@ -127,12 +129,12 @@ enum init_completion_state { HMC_OBJS_CREATED, HW_RSRC_INITIALIZED, CCQ_CREATED, - CEQ0_CREATED, /* Last state of probe */ - ILQ_CREATED, - IEQ_CREATED, + CEQ0_CREATED, CEQS_CREATED, PBLE_CHUNK_MEM, AEQ_CREATED, + ILQ_CREATED, + IEQ_CREATED, /* Last state of probe */ IP_ADDR_REGISTERED, /* Last state of open */ }; @@ -167,6 +169,7 @@ struct irdma_cqp_request { bool request_done; /* READ/WRITE_ONCE macros operate on it */ bool waiting:1; bool dynamic:1; + bool pending:1; }; struct irdma_cqp { @@ -179,6 +182,7 @@ struct irdma_cqp { struct irdma_dma_mem host_ctx; u64 *scratch_array; struct irdma_cqp_request *cqp_requests; + struct irdma_ooo_cqp_op *oop_op_array; struct list_head cqp_avail_reqs; struct list_head cqp_pending_reqs; }; @@ -257,6 +261,7 @@ struct irdma_pci_f { bool reset:1; bool rsrc_created:1; bool msix_shared:1; + bool hwqp1_rsvd:1; u8 rsrc_profile; u8 *hmc_info_mem; u8 *mem_rsrc; @@ -269,6 +274,8 @@ struct irdma_pci_f { u32 max_mr; u32 max_qp; u32 max_cq; + u32 max_srq; + u32 next_srq; u32 max_ah; u32 next_ah; u32 max_mcg; @@ -282,6 +289,7 @@ struct irdma_pci_f { u32 mr_stagmask; u32 used_pds; u32 used_cqs; + u32 used_srqs; u32 used_mrs; u32 used_qps; u32 arp_table_size; @@ -293,6 +301,7 @@ struct irdma_pci_f { unsigned long *allocated_ws_nodes; unsigned long *allocated_qps; unsigned long *allocated_cqs; + unsigned long *allocated_srqs; unsigned long *allocated_mrs; unsigned long *allocated_pds; unsigned long *allocated_mcgs; @@ -327,10 +336,13 @@ struct irdma_pci_f { wait_queue_head_t vchnl_waitq; struct workqueue_struct *cqp_cmpl_wq; struct work_struct cqp_cmpl_work; + struct workqueue_struct *vchnl_wq; struct irdma_sc_vsi default_vsi; void *back_fcn; struct irdma_gen_ops gen_ops; struct irdma_device *iwdev; + DECLARE_HASHTABLE(ah_hash_tbl, 8); + struct mutex ah_tbl_lock; /* protect AH hash table access */ }; struct irdma_device { @@ -340,8 +352,6 @@ struct irdma_device { struct workqueue_struct *cleanup_wq; struct irdma_sc_vsi vsi; struct irdma_cm_core cm_core; - DECLARE_HASHTABLE(ah_hash_tbl, 8); - struct mutex ah_tbl_lock; /* protect AH hash table access */ u32 roce_cwnd; u32 roce_ackcreds; u32 vendor_id; @@ -350,12 +360,14 @@ struct irdma_device 
{ u32 rcv_wnd; u16 mac_ip_table_idx; u16 vsi_num; + u16 vport_id; u8 rcv_wscale; u8 iw_status; bool roce_mode:1; bool roce_dcqcn_en:1; bool dcb_vlan_mode:1; bool iw_ooo:1; + bool is_vport:1; enum init_completion_state init_state; wait_queue_head_t suspend_wq; @@ -413,6 +425,11 @@ static inline struct irdma_pci_f *dev_to_rf(struct irdma_sc_dev *dev) return container_of(dev, struct irdma_pci_f, sc_dev); } +static inline struct irdma_srq *to_iwsrq(struct ib_srq *ibsrq) +{ + return container_of(ibsrq, struct irdma_srq, ibsrq); +} + /** * irdma_alloc_resource - allocate a resource * @iwdev: device pointer @@ -508,7 +525,8 @@ int irdma_modify_qp_roce(struct ib_qp *ibqp, struct ib_qp_attr *attr, void irdma_cq_add_ref(struct ib_cq *ibcq); void irdma_cq_rem_ref(struct ib_cq *ibcq); void irdma_cq_wq_destroy(struct irdma_pci_f *rf, struct irdma_sc_cq *cq); - +void irdma_srq_event(struct irdma_sc_srq *srq); +void irdma_srq_wq_destroy(struct irdma_pci_f *rf, struct irdma_sc_srq *srq); void irdma_cleanup_pending_cqp_op(struct irdma_pci_f *rf); int irdma_hw_modify_qp(struct irdma_device *iwdev, struct irdma_qp *iwqp, struct irdma_modify_qp_info *info, bool wait); @@ -557,4 +575,5 @@ int irdma_netdevice_event(struct notifier_block *notifier, unsigned long event, void *ptr); void irdma_add_ip(struct irdma_device *iwdev); void cqp_compl_worker(struct work_struct *work); +void irdma_log_invalid_mtu(u16 mtu, struct irdma_sc_dev *dev); #endif /* IRDMA_MAIN_H */ diff --git a/drivers/infiniband/hw/irdma/pble.c b/drivers/infiniband/hw/irdma/pble.c index 37ce35cb10e7..3091f9345f12 100644 --- a/drivers/infiniband/hw/irdma/pble.c +++ b/drivers/infiniband/hw/irdma/pble.c @@ -193,8 +193,15 @@ static enum irdma_sd_entry_type irdma_get_type(struct irdma_sc_dev *dev, { enum irdma_sd_entry_type sd_entry_type; - sd_entry_type = !idx->rel_pd_idx && pages == IRDMA_HMC_PD_CNT_IN_SD ? - IRDMA_SD_TYPE_DIRECT : IRDMA_SD_TYPE_PAGED; + if (dev->hw_attrs.uk_attrs.hw_rev >= IRDMA_GEN_3) + sd_entry_type = (!idx->rel_pd_idx && + pages == IRDMA_HMC_PD_CNT_IN_SD) ? + IRDMA_SD_TYPE_DIRECT : IRDMA_SD_TYPE_PAGED; + else + sd_entry_type = (!idx->rel_pd_idx && + pages == IRDMA_HMC_PD_CNT_IN_SD && + dev->privileged) ? + IRDMA_SD_TYPE_DIRECT : IRDMA_SD_TYPE_PAGED; return sd_entry_type; } @@ -279,10 +286,11 @@ static int add_pble_prm(struct irdma_hmc_pble_rsrc *pble_rsrc) sd_reg_val = (sd_entry_type == IRDMA_SD_TYPE_PAGED) ? 
sd_entry->u.pd_table.pd_page_addr.pa : sd_entry->u.bp.addr.pa; - - if (!sd_entry->valid) { - ret_code = irdma_hmc_sd_one(dev, hmc_info->hmc_fn_id, sd_reg_val, - idx->sd_idx, sd_entry->entry_type, true); + if ((dev->privileged && !sd_entry->valid) || + dev->hw_attrs.uk_attrs.hw_rev >= IRDMA_GEN_3) { + ret_code = irdma_hmc_sd_one(dev, hmc_info->hmc_fn_id, + sd_reg_val, idx->sd_idx, + sd_entry->entry_type, true); if (ret_code) goto error; } diff --git a/drivers/infiniband/hw/irdma/protos.h b/drivers/infiniband/hw/irdma/protos.h index c0c9441885d3..324cfbf21764 100644 --- a/drivers/infiniband/hw/irdma/protos.h +++ b/drivers/infiniband/hw/irdma/protos.h @@ -10,6 +10,7 @@ #define ALL_TC2PFC 0xff #define CQP_COMPL_WAIT_TIME_MS 10 #define CQP_TIMEOUT_THRESHOLD 500 +#define CQP_DEF_CMPL_TIMEOUT_THRESHOLD 2500 /* init operations */ int irdma_sc_dev_init(enum irdma_vers ver, struct irdma_sc_dev *dev, diff --git a/drivers/infiniband/hw/irdma/puda.h b/drivers/infiniband/hw/irdma/puda.h index 2fc638f2b143..d65041bee667 100644 --- a/drivers/infiniband/hw/irdma/puda.h +++ b/drivers/infiniband/hw/irdma/puda.h @@ -91,7 +91,7 @@ struct irdma_puda_rsrc_info { u32 rq_size; u32 tx_buf_cnt; /* total bufs allocated will be rq_size + tx_buf_cnt */ u16 buf_size; - u8 stats_idx; + u16 stats_idx; bool stats_idx_valid:1; int abi_ver; }; @@ -140,7 +140,7 @@ struct irdma_puda_rsrc { u64 crc_err; u64 pmode_count; u64 partials_handled; - u8 stats_idx; + u16 stats_idx; bool check_crc:1; bool stats_idx_valid:1; }; diff --git a/drivers/infiniband/hw/irdma/type.h b/drivers/infiniband/hw/irdma/type.h index 527c6da2c1ac..4ae77cdde9dc 100644 --- a/drivers/infiniband/hw/irdma/type.h +++ b/drivers/infiniband/hw/irdma/type.h @@ -8,6 +8,8 @@ #include "hmc.h" #include "uda.h" #include "ws.h" +#include "virtchnl.h" + #define IRDMA_DEBUG_ERR "ERR" #define IRDMA_DEBUG_INIT "INIT" #define IRDMA_DEBUG_DEV "DEV" @@ -95,12 +97,6 @@ enum irdma_term_mpa_errors { MPA_REQ_RSP = 0x04, }; -enum irdma_qp_event_type { - IRDMA_QP_EVENT_CATASTROPHIC, - IRDMA_QP_EVENT_ACCESS_ERR, - IRDMA_QP_EVENT_REQ_ERR, -}; - enum irdma_hw_stats_index { /* gen1 - 32-bit */ IRDMA_HW_STAT_INDEX_IP4RXDISCARD = 0, @@ -154,12 +150,46 @@ enum irdma_hw_stats_index { IRDMA_HW_STAT_INDEX_RXRPCNPIGNORED = 44, IRDMA_HW_STAT_INDEX_TXNPCNPSENT = 45, IRDMA_HW_STAT_INDEX_MAX_GEN_2 = 46, + + /* gen3 */ + IRDMA_HW_STAT_INDEX_RNR_SENT = 46, + IRDMA_HW_STAT_INDEX_RNR_RCVD = 47, + IRDMA_HW_STAT_INDEX_RDMAORDLMTCNT = 48, + IRDMA_HW_STAT_INDEX_RDMAIRDLMTCNT = 49, + IRDMA_HW_STAT_INDEX_RDMARXATS = 50, + IRDMA_HW_STAT_INDEX_RDMATXATS = 51, + IRDMA_HW_STAT_INDEX_NAKSEQERR = 52, + IRDMA_HW_STAT_INDEX_NAKSEQERR_IMPLIED = 53, + IRDMA_HW_STAT_INDEX_RTO = 54, + IRDMA_HW_STAT_INDEX_RXOOOPKTS = 55, + IRDMA_HW_STAT_INDEX_ICRCERR = 56, + + IRDMA_HW_STAT_INDEX_MAX_GEN_3 = 57, }; enum irdma_feature_type { IRDMA_FEATURE_FW_INFO = 0, IRDMA_HW_VERSION_INFO = 1, + IRDMA_QP_MAX_INCR = 2, + IRDMA_CQ_MAX_INCR = 3, + IRDMA_CEQ_MAX_INCR = 4, + IRDMA_SD_MAX_INCR = 5, + IRDMA_MR_MAX_INCR = 6, + IRDMA_Q1_MAX_INCR = 7, + IRDMA_AH_MAX_INCR = 8, + IRDMA_SRQ_MAX_INCR = 9, + IRDMA_TIMER_MAX_INCR = 10, + IRDMA_XF_MAX_INCR = 11, + IRDMA_RRF_MAX_INCR = 12, + IRDMA_PBLE_MAX_INCR = 13, + IRDMA_OBJ_1 = 22, + IRDMA_OBJ_2 = 23, + IRDMA_ENDPT_TRK = 24, + IRDMA_FTN_INLINE_MAX = 25, IRDMA_QSETS_MAX = 26, + IRDMA_ASO = 27, + IRDMA_FTN_FLAGS = 32, + IRDMA_FTN_NOP = 33, IRDMA_MAX_FEATURES, /* Must be last entry */ }; @@ -206,6 +236,7 @@ enum irdma_syn_rst_handling { enum irdma_queue_type { IRDMA_QUEUE_TYPE_SQ_RQ = 0, 
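For context, the irdma_get_type() change above boils down to a two-branch decision: GEN3 and later choose a DIRECT SD purely from PD alignment and fullness, while earlier generations additionally require a privileged function. A minimal standalone sketch of that logic (type names and the IRDMA_GEN_3 constant are stubbed for illustration):

```c
#include <stdbool.h>

enum sd_type { SD_TYPE_PAGED, SD_TYPE_DIRECT };

#define IRDMA_GEN_3 3	/* stand-in for the driver's hw_rev constant */

/* Mirrors the post-patch selection: a DIRECT SD needs a zero relative
 * PD index and a fully populated SD; pre-GEN3 hardware additionally
 * requires dev->privileged to be set.
 */
static enum sd_type pick_sd_type(unsigned int hw_rev, bool rel_pd_idx_zero,
				 bool sd_full, bool privileged)
{
	if (hw_rev >= IRDMA_GEN_3)
		return (rel_pd_idx_zero && sd_full) ?
			SD_TYPE_DIRECT : SD_TYPE_PAGED;

	return (rel_pd_idx_zero && sd_full && privileged) ?
		SD_TYPE_DIRECT : SD_TYPE_PAGED;
}
```

The same privileged gate reappears in add_pble_prm() above, where only privileged functions (or GEN3+ devices) go on to call irdma_hmc_sd_one().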
IRDMA_QUEUE_TYPE_CQP, + IRDMA_QUEUE_TYPE_SRQ, }; struct irdma_sc_dev; @@ -233,12 +264,22 @@ struct irdma_cqp_init_info { __le64 *host_ctx; u64 *scratch_array; u32 sq_size; + struct irdma_ooo_cqp_op *ooo_op_array; + u32 pe_en_vf_cnt; u16 hw_maj_ver; u16 hw_min_ver; u8 struct_ver; u8 hmc_profile; u8 ena_vf_count; u8 ceqs_per_vf; + u8 ooisc_blksize; + u8 rrsp_blksize; + u8 q1_blksize; + u8 xmit_blksize; + u8 ts_override; + u8 ts_shift; + u8 en_fine_grained_timers; + u8 blksizes_valid; bool en_datacenter_tcp:1; bool disable_packed:1; bool rocev2_rto_policy:1; @@ -310,9 +351,21 @@ struct irdma_vsi_pestat { spinlock_t lock; /* rdma stats lock */ }; +struct irdma_mmio_region { + u8 __iomem *addr; + resource_size_t len; + resource_size_t offset; +}; + struct irdma_hw { - u8 __iomem *hw_addr; - u8 __iomem *priv_hw_addr; + union { + u8 __iomem *hw_addr; + struct { + struct irdma_mmio_region rdma_reg; /* RDMA region */ + struct irdma_mmio_region *io_regs; /* Non-RDMA MMIO regions */ + u16 num_io_regions; /* Number of Non-RDMA MMIO regions */ + }; + }; struct device *device; struct irdma_hmc_info hmc; }; @@ -351,7 +404,21 @@ struct irdma_cqp_quanta { __le64 elem[IRDMA_CQP_WQE_SIZE]; }; +struct irdma_ooo_cqp_op { + struct list_head list_entry; + u64 scratch; + u32 def_info; + u32 sw_def_info; + u32 wqe_idx; + bool deferred:1; +}; + struct irdma_sc_cqp { + spinlock_t ooo_list_lock; /* protects list of pending completions */ + struct list_head ooo_avail; + struct list_head ooo_pnd; + u32 last_def_cmpl_ticket; + u32 sw_def_cmpl_ticket; u32 size; u64 sq_pa; u64 host_ctx_pa; @@ -367,8 +434,10 @@ struct irdma_sc_cqp { u64 *scratch_array; u64 requested_ops; atomic64_t completed_ops; + struct irdma_ooo_cqp_op *ooo_op_array; u32 cqp_id; u32 sq_size; + u32 pe_en_vf_cnt; u32 hw_sq_size; u16 hw_maj_ver; u16 hw_min_ver; @@ -378,6 +447,14 @@ struct irdma_sc_cqp { u8 ena_vf_count; u8 timeout_count; u8 ceqs_per_vf; + u8 ooisc_blksize; + u8 rrsp_blksize; + u8 q1_blksize; + u8 xmit_blksize; + u8 ts_override; + u8 ts_shift; + u8 en_fine_grained_timers; + u8 blksizes_valid; bool en_datacenter_tcp:1; bool disable_packed:1; bool rocev2_rto_policy:1; @@ -397,6 +474,8 @@ struct irdma_sc_aeq { u32 msix_idx; u8 polarity; bool virtual_map:1; + bool pasid_valid:1; + u32 pasid; }; struct irdma_sc_ceq { @@ -412,13 +491,15 @@ struct irdma_sc_ceq { u8 tph_val; u32 first_pm_pbl_idx; u8 polarity; - struct irdma_sc_vsi *vsi; + u16 vsi_idx; struct irdma_sc_cq **reg_cq; u32 reg_cq_size; spinlock_t req_cq_lock; /* protect access to reg_cq array */ bool virtual_map:1; bool tph_en:1; bool itr_no_expire:1; + bool pasid_valid:1; + u32 pasid; }; struct irdma_sc_cq { @@ -426,6 +507,7 @@ struct irdma_sc_cq { u64 cq_pa; u64 shadow_area_pa; struct irdma_sc_dev *dev; + u16 vsi_idx; struct irdma_sc_vsi *vsi; void *pbl_list; void *back_cq; @@ -477,8 +559,13 @@ struct irdma_sc_qp { bool virtual_map:1; bool flush_sq:1; bool flush_rq:1; + bool err_sq_idx_valid:1; + bool err_rq_idx_valid:1; + u32 err_sq_idx; + u32 err_rq_idx; bool sq_flush_code:1; bool rq_flush_code:1; + u32 pkt_limit; enum irdma_flush_opcode flush_code; enum irdma_qp_event_type event_type; u8 term_flags; @@ -489,13 +576,13 @@ struct irdma_sc_qp { struct irdma_stats_inst_info { bool use_hmc_fcn_index; u8 hmc_fn_id; - u8 stats_idx; + u16 stats_idx; }; struct irdma_up_info { u8 map[8]; u8 cnp_up_override; - u8 hmc_fcn_idx; + u16 hmc_fcn_idx; bool use_vlan:1; bool use_cnp_up_override:1; }; @@ -518,6 +605,8 @@ struct irdma_ws_node_info { struct irdma_hmc_fpm_misc { u32 max_ceqs; u32 max_sds; 
+ u32 loc_mem_pages; + u8 ird; u32 xf_block_size; u32 q1_block_size; u32 ht_multiplier; @@ -526,6 +615,7 @@ struct irdma_hmc_fpm_misc { u32 ooiscf_block_size; }; +#define IRDMA_VCHNL_MAX_MSG_SIZE 512 #define IRDMA_LEAF_DEFAULT_REL_BW 64 #define IRDMA_PARENT_DEFAULT_REL_BW 1 @@ -601,19 +691,28 @@ struct irdma_sc_dev { u64 cqp_cmd_stats[IRDMA_MAX_CQP_OPS]; struct irdma_hw_attrs hw_attrs; struct irdma_hmc_info *hmc_info; + struct irdma_vchnl_rdma_caps vc_caps; + u8 vc_recv_buf[IRDMA_VCHNL_MAX_MSG_SIZE]; + u16 vc_recv_len; struct irdma_sc_cqp *cqp; struct irdma_sc_aeq *aeq; struct irdma_sc_ceq *ceq[IRDMA_CEQ_MAX_COUNT]; struct irdma_sc_cq *ccq; const struct irdma_irq_ops *irq_ops; + struct irdma_qos qos[IRDMA_MAX_USER_PRIORITY]; struct irdma_hmc_fpm_misc hmc_fpm_misc; struct irdma_ws_node *ws_tree_root; struct mutex ws_mutex; /* ws tree mutex */ + u32 vchnl_ver; u16 num_vfs; - u8 hmc_fn_id; + u16 hmc_fn_id; u8 vf_id; + bool privileged:1; bool vchnl_up:1; bool ceq_valid:1; + bool is_pf:1; + u8 protocol_used; + struct mutex vchnl_mutex; /* mutex to synchronize RDMA virtual channel messages */ u8 pci_rev; int (*ws_add)(struct irdma_sc_vsi *vsi, u8 user_pri); void (*ws_remove)(struct irdma_sc_vsi *vsi, u8 user_pri); @@ -632,6 +731,51 @@ struct irdma_modify_cq_info { bool cq_resize:1; }; +struct irdma_srq_init_info { + struct irdma_sc_pd *pd; + struct irdma_sc_vsi *vsi; + u64 srq_pa; + u64 shadow_area_pa; + u32 first_pm_pbl_idx; + u32 pasid; + u32 srq_size; + u16 srq_limit; + u8 pasid_valid; + u8 wqe_size; + u8 leaf_pbl_size; + u8 virtual_map; + u8 tph_en; + u8 arm_limit_event; + u8 tph_value; + u8 pbl_chunk_size; + struct irdma_srq_uk_init_info srq_uk_init_info; +}; + +struct irdma_sc_srq { + struct irdma_sc_dev *dev; + struct irdma_sc_vsi *vsi; + struct irdma_sc_pd *pd; + struct irdma_srq_uk srq_uk; + void *back_srq; + u64 srq_pa; + u64 shadow_area_pa; + u32 first_pm_pbl_idx; + u32 pasid; + u32 hw_srq_size; + u16 srq_limit; + u8 pasid_valid; + u8 leaf_pbl_size; + u8 virtual_map; + u8 tph_en; + u8 arm_limit_event; + u8 tph_val; +}; + +struct irdma_modify_srq_info { + u16 srq_limit; + u8 arm_limit_event; +}; + struct irdma_create_qp_info { bool ord_valid:1; bool tcp_ctx_valid:1; @@ -671,7 +815,8 @@ struct irdma_ccq_cqe_info { u16 maj_err_code; u16 min_err_code; u8 op_code; - bool error; + bool error:1; + bool pending:1; }; struct irdma_dcb_app_info { @@ -720,7 +865,7 @@ struct irdma_vsi_init_info { struct irdma_vsi_stats_info { struct irdma_vsi_pestat *pestat; - u8 fcn_id; + u16 fcn_id; bool alloc_stats_inst; }; @@ -731,7 +876,8 @@ struct irdma_device_init_info { __le64 *fpm_commit_buf; struct irdma_hw *hw; void __iomem *bar0; - u8 hmc_fn_id; + enum irdma_protocol_used protocol_used; + u16 hmc_fn_id; }; struct irdma_ceq_init_info { @@ -746,8 +892,8 @@ struct irdma_ceq_init_info { bool itr_no_expire:1; u8 pbl_chunk_size; u8 tph_val; + u16 vsi_idx; u32 first_pm_pbl_idx; - struct irdma_sc_vsi *vsi; struct irdma_sc_cq **reg_cq; u32 reg_cq_idx; }; @@ -807,6 +953,8 @@ struct irdma_udp_offload_info { u32 cwnd; u8 rexmit_thresh; u8 rnr_nak_thresh; + u8 rnr_nak_tmr; + u8 min_rnr_timer; }; struct irdma_roce_offload_info { @@ -833,6 +981,7 @@ struct irdma_roce_offload_info { bool dctcp_en:1; bool fw_cc_enable:1; bool use_stats_inst:1; + u8 local_ack_timeout; u16 t_high; u16 t_low; u8 last_byte_sent; @@ -933,8 +1082,10 @@ struct irdma_qp_host_ctx_info { }; u32 send_cq_num; u32 rcv_cq_num; + u32 srq_id; u32 rem_endpoint_idx; - u8 stats_idx; + u16 stats_idx; + bool remote_atomics_en:1; bool srq_valid:1; bool 
tcp_info_valid:1; bool iwarp_info_valid:1; @@ -945,6 +1096,7 @@ struct irdma_qp_host_ctx_info { struct irdma_aeqe_info { u64 compl_ctx; u32 qp_cq_id; + u32 def_info; /* only valid for DEF_CMPL */ u16 ae_id; u16 wqe_idx; u8 tcp_state; @@ -953,9 +1105,11 @@ struct irdma_aeqe_info { bool cq:1; bool sq:1; bool rq:1; + bool srq:1; bool in_rdrsp_wr:1; bool out_rdrsp:1; bool aeqe_overflow:1; + bool err_rq_idx_valid:1; u8 q2_data_written; u8 ae_src; }; @@ -972,7 +1126,8 @@ struct irdma_allocate_stag_info { bool use_hmc_fcn_index:1; bool use_pf_rid:1; bool all_memory:1; - u8 hmc_fcn_index; + bool remote_atomics_en:1; + u16 hmc_fcn_index; }; struct irdma_mw_alloc_info { @@ -1000,6 +1155,7 @@ struct irdma_reg_ns_stag_info { u8 hmc_fcn_index; bool use_pf_rid:1; bool all_memory:1; + bool remote_atomics_en:1; }; struct irdma_fast_reg_stag_info { @@ -1023,6 +1179,7 @@ struct irdma_fast_reg_stag_info { u8 hmc_fcn_index; bool use_pf_rid:1; bool defer_flag:1; + bool remote_atomics_en:1; }; struct irdma_dealloc_stag_info { @@ -1130,6 +1287,8 @@ struct irdma_cqp_manage_push_page_info { }; struct irdma_qp_flush_info { + u32 err_sq_idx; + u32 err_rq_idx; u16 sq_minor_code; u16 sq_major_code; u16 rq_minor_code; @@ -1140,6 +1299,8 @@ struct irdma_qp_flush_info { bool rq:1; bool userflushcode:1; bool generate_ae:1; + bool err_sq_idx_valid:1; + bool err_rq_idx_valid:1; }; struct irdma_gen_ae_info { @@ -1189,6 +1350,11 @@ void irdma_sc_pd_init(struct irdma_sc_dev *dev, struct irdma_sc_pd *pd, u32 pd_i void irdma_cfg_aeq(struct irdma_sc_dev *dev, u32 idx, bool enable); void irdma_check_cqp_progress(struct irdma_cqp_timeout *cqp_timeout, struct irdma_sc_dev *dev); +void irdma_sc_cqp_def_cmpl_ae_handler(struct irdma_sc_dev *dev, + struct irdma_aeqe_info *info, + bool first, u64 *scratch, + u32 *sw_def_info); +u64 irdma_sc_cqp_cleanup_handler(struct irdma_sc_dev *dev); int irdma_sc_cqp_create(struct irdma_sc_cqp *cqp, u16 *maj_err, u16 *min_err); int irdma_sc_cqp_destroy(struct irdma_sc_cqp *cqp); int irdma_sc_cqp_init(struct irdma_sc_cqp *cqp, @@ -1224,6 +1390,8 @@ void irdma_sc_cq_resize(struct irdma_sc_cq *cq, struct irdma_modify_cq_info *inf int irdma_sc_static_hmc_pages_allocated(struct irdma_sc_cqp *cqp, u64 scratch, u8 hmc_fn_id, bool post_sq, bool poll_registers); +int irdma_sc_srq_init(struct irdma_sc_srq *srq, + struct irdma_srq_init_info *info); void sc_vsi_update_stats(struct irdma_sc_vsi *vsi); struct cqp_info { @@ -1467,6 +1635,23 @@ struct cqp_info { struct irdma_dma_mem query_buff_mem; u64 scratch; } query_rdma; + + struct { + struct irdma_sc_srq *srq; + u64 scratch; + } srq_create; + + struct { + struct irdma_sc_srq *srq; + struct irdma_modify_srq_info info; + u64 scratch; + } srq_modify; + + struct { + struct irdma_sc_srq *srq; + u64 scratch; + } srq_destroy; + } u; }; diff --git a/drivers/infiniband/hw/irdma/uda_d.h b/drivers/infiniband/hw/irdma/uda_d.h index 5a9e6eabf032..4fb4daa20722 100644 --- a/drivers/infiniband/hw/irdma/uda_d.h +++ b/drivers/infiniband/hw/irdma/uda_d.h @@ -78,8 +78,7 @@ #define IRDMA_UDAQPC_IPID GENMASK_ULL(47, 32) #define IRDMA_UDAQPC_SNDMSS GENMASK_ULL(29, 16) #define IRDMA_UDAQPC_VLANTAG GENMASK_ULL(15, 0) - -#define IRDMA_UDA_CQPSQ_MAV_PDINDEXHI GENMASK_ULL(21, 20) +#define IRDMA_UDA_CQPSQ_MAV_PDINDEXHI GENMASK_ULL(27, 20) #define IRDMA_UDA_CQPSQ_MAV_PDINDEXLO GENMASK_ULL(63, 48) #define IRDMA_UDA_CQPSQ_MAV_SRCMACADDRINDEX GENMASK_ULL(29, 24) #define IRDMA_UDA_CQPSQ_MAV_ARPINDEX GENMASK_ULL(63, 48) @@ -94,7 +93,7 @@ #define IRDMA_UDA_CQPSQ_MAV_OPCODE GENMASK_ULL(37, 32) 
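The cqp_info union above now carries srq_create/srq_modify/srq_destroy payloads. As a hedged sketch of how the modify variant would be queued, modeled on the irdma_srq_wq_destroy() helper added to utils.c later in this patch (the function name here is hypothetical and error handling is trimmed):

```c
/* Hypothetical helper: arm the SRQ limit event via IRDMA_OP_SRQ_MODIFY. */
static int irdma_modify_srq_cmd(struct irdma_pci_f *rf,
				struct irdma_sc_srq *srq, u16 srq_limit)
{
	struct irdma_cqp_request *cqp_request;
	struct cqp_cmds_info *cqp_info;
	int status;

	cqp_request = irdma_alloc_and_get_cqp_request(&rf->cqp, true);
	if (!cqp_request)
		return -ENOMEM;

	cqp_info = &cqp_request->info;
	cqp_info->cqp_cmd = IRDMA_OP_SRQ_MODIFY;
	cqp_info->post_sq = 1;
	cqp_info->in.u.srq_modify.srq = srq;
	cqp_info->in.u.srq_modify.info.srq_limit = srq_limit;
	cqp_info->in.u.srq_modify.info.arm_limit_event = 1;
	cqp_info->in.u.srq_modify.scratch = (uintptr_t)cqp_request;

	status = irdma_handle_cqp_op(rf, cqp_request);
	irdma_put_cqp_request(&rf->cqp, cqp_request);

	return status;
}
```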
#define IRDMA_UDA_CQPSQ_MAV_DOLOOPBACKK BIT_ULL(62) #define IRDMA_UDA_CQPSQ_MAV_IPV4VALID BIT_ULL(59) -#define IRDMA_UDA_CQPSQ_MAV_AVIDX GENMASK_ULL(16, 0) +#define IRDMA_UDA_CQPSQ_MAV_AVIDX GENMASK_ULL(23, 0) #define IRDMA_UDA_CQPSQ_MAV_INSERTVLANTAG BIT_ULL(60) #define IRDMA_UDA_MGCTX_VFFLAG BIT_ULL(29) #define IRDMA_UDA_MGCTX_DESTPORT GENMASK_ULL(47, 32) diff --git a/drivers/infiniband/hw/irdma/uk.c b/drivers/infiniband/hw/irdma/uk.c index 38c54e59cc2e..ce1ae10c30fc 100644 --- a/drivers/infiniband/hw/irdma/uk.c +++ b/drivers/infiniband/hw/irdma/uk.c @@ -198,6 +198,26 @@ __le64 *irdma_qp_get_next_send_wqe(struct irdma_qp_uk *qp, u32 *wqe_idx, return wqe; } +__le64 *irdma_srq_get_next_recv_wqe(struct irdma_srq_uk *srq, u32 *wqe_idx) +{ + int ret_code; + __le64 *wqe; + + if (IRDMA_RING_FULL_ERR(srq->srq_ring)) + return NULL; + + IRDMA_ATOMIC_RING_MOVE_HEAD(srq->srq_ring, *wqe_idx, ret_code); + if (ret_code) + return NULL; + + if (!*wqe_idx) + srq->srwqe_polarity = !srq->srwqe_polarity; + /* rq_wqe_size_multiplier is no of 32 byte quanta in one rq wqe */ + wqe = srq->srq_base[*wqe_idx * (srq->wqe_size_multiplier)].elem; + + return wqe; +} + /** * irdma_qp_get_next_recv_wqe - get next qp's rcv wqe * @qp: hw qp ptr @@ -318,6 +338,160 @@ int irdma_uk_rdma_write(struct irdma_qp_uk *qp, struct irdma_post_sq_info *info, } /** + * irdma_uk_atomic_fetch_add - atomic fetch and add operation + * @qp: hw qp ptr + * @info: post sq information + * @post_sq: flag to post sq + */ +int irdma_uk_atomic_fetch_add(struct irdma_qp_uk *qp, + struct irdma_post_sq_info *info, bool post_sq) +{ + struct irdma_atomic_fetch_add *op_info; + u32 total_size = 0; + u16 quanta = 2; + u32 wqe_idx; + __le64 *wqe; + u64 hdr; + + op_info = &info->op.atomic_fetch_add; + wqe = irdma_qp_get_next_send_wqe(qp, &wqe_idx, quanta, total_size, + info); + if (!wqe) + return -ENOMEM; + + set_64bit_val(wqe, 0, op_info->tagged_offset); + set_64bit_val(wqe, 8, + FIELD_PREP(IRDMAQPSQ_STAG, op_info->stag)); + set_64bit_val(wqe, 16, op_info->remote_tagged_offset); + + hdr = FIELD_PREP(IRDMAQPSQ_ADDFRAGCNT, 1) | + FIELD_PREP(IRDMAQPSQ_REMOTE_STAG, op_info->remote_stag) | + FIELD_PREP(IRDMAQPSQ_OPCODE, IRDMAQP_OP_ATOMIC_FETCH_ADD) | + FIELD_PREP(IRDMAQPSQ_READFENCE, info->read_fence) | + FIELD_PREP(IRDMAQPSQ_LOCALFENCE, info->local_fence) | + FIELD_PREP(IRDMAQPSQ_SIGCOMPL, info->signaled) | + FIELD_PREP(IRDMAQPSQ_VALID, qp->swqe_polarity); + + set_64bit_val(wqe, 32, op_info->fetch_add_data_bytes); + set_64bit_val(wqe, 40, 0); + set_64bit_val(wqe, 48, 0); + set_64bit_val(wqe, 56, + FIELD_PREP(IRDMAQPSQ_VALID, qp->swqe_polarity)); + + dma_wmb(); /* make sure WQE is populated before valid bit is set */ + + set_64bit_val(wqe, 24, hdr); + + if (post_sq) + irdma_uk_qp_post_wr(qp); + + return 0; +} + +/** + * irdma_uk_atomic_compare_swap - atomic compare and swap operation + * @qp: hw qp ptr + * @info: post sq information + * @post_sq: flag to post sq + */ +int irdma_uk_atomic_compare_swap(struct irdma_qp_uk *qp, + struct irdma_post_sq_info *info, bool post_sq) +{ + struct irdma_atomic_compare_swap *op_info; + u32 total_size = 0; + u16 quanta = 2; + u32 wqe_idx; + __le64 *wqe; + u64 hdr; + + op_info = &info->op.atomic_compare_swap; + wqe = irdma_qp_get_next_send_wqe(qp, &wqe_idx, quanta, total_size, + info); + if (!wqe) + return -ENOMEM; + + set_64bit_val(wqe, 0, op_info->tagged_offset); + set_64bit_val(wqe, 8, + FIELD_PREP(IRDMAQPSQ_STAG, op_info->stag)); + set_64bit_val(wqe, 16, op_info->remote_tagged_offset); + + hdr = 
FIELD_PREP(IRDMAQPSQ_ADDFRAGCNT, 1) | + FIELD_PREP(IRDMAQPSQ_REMOTE_STAG, op_info->remote_stag) | + FIELD_PREP(IRDMAQPSQ_OPCODE, IRDMAQP_OP_ATOMIC_COMPARE_SWAP_ADD) | + FIELD_PREP(IRDMAQPSQ_READFENCE, info->read_fence) | + FIELD_PREP(IRDMAQPSQ_LOCALFENCE, info->local_fence) | + FIELD_PREP(IRDMAQPSQ_SIGCOMPL, info->signaled) | + FIELD_PREP(IRDMAQPSQ_VALID, qp->swqe_polarity); + + set_64bit_val(wqe, 32, op_info->swap_data_bytes); + set_64bit_val(wqe, 40, op_info->compare_data_bytes); + set_64bit_val(wqe, 48, 0); + set_64bit_val(wqe, 56, + FIELD_PREP(IRDMAQPSQ_VALID, qp->swqe_polarity)); + + dma_wmb(); /* make sure WQE is populated before valid bit is set */ + + set_64bit_val(wqe, 24, hdr); + + if (post_sq) + irdma_uk_qp_post_wr(qp); + + return 0; +} + +/** + * irdma_uk_srq_post_receive - post a receive wqe to a shared rq + * @srq: shared rq ptr + * @info: post rq information + */ +int irdma_uk_srq_post_receive(struct irdma_srq_uk *srq, + struct irdma_post_rq_info *info) +{ + u32 wqe_idx, i, byte_off; + u32 addl_frag_cnt; + __le64 *wqe; + u64 hdr; + + if (srq->max_srq_frag_cnt < info->num_sges) + return -EINVAL; + + wqe = irdma_srq_get_next_recv_wqe(srq, &wqe_idx); + if (!wqe) + return -ENOMEM; + + addl_frag_cnt = info->num_sges > 1 ? info->num_sges - 1 : 0; + srq->wqe_ops.iw_set_fragment(wqe, 0, info->sg_list, + srq->srwqe_polarity); + + for (i = 1, byte_off = 32; i < info->num_sges; i++) { + srq->wqe_ops.iw_set_fragment(wqe, byte_off, &info->sg_list[i], + srq->srwqe_polarity); + byte_off += 16; + } + + /* if not an odd number set valid bit in next fragment */ + if (srq->uk_attrs->hw_rev >= IRDMA_GEN_2 && !(info->num_sges & 0x01) && + info->num_sges) { + srq->wqe_ops.iw_set_fragment(wqe, byte_off, NULL, + srq->srwqe_polarity); + if (srq->uk_attrs->hw_rev == IRDMA_GEN_2) + ++addl_frag_cnt; + } + + set_64bit_val(wqe, 16, (u64)info->wr_id); + hdr = FIELD_PREP(IRDMAQPSQ_ADDFRAGCNT, addl_frag_cnt) | + FIELD_PREP(IRDMAQPSQ_VALID, srq->srwqe_polarity); + + dma_wmb(); /* make sure WQE is populated before valid bit is set */ + + set_64bit_val(wqe, 24, hdr); + + set_64bit_val(srq->shadow_area, 0, (wqe_idx + 1) % srq->srq_ring.size); + + return 0; +} + +/** * irdma_uk_rdma_read - rdma read command * @qp: hw qp ptr * @info: post sq information @@ -973,6 +1147,9 @@ int irdma_uk_cq_poll_cmpl(struct irdma_cq_uk *cq, u64 comp_ctx, qword0, qword2, qword3; __le64 *cqe; struct irdma_qp_uk *qp; + struct irdma_srq_uk *srq; + struct qp_err_code qp_err; + u8 is_srq; struct irdma_ring *pring = NULL; u32 wqe_idx; int ret_code; @@ -1046,21 +1223,46 @@ int irdma_uk_cq_poll_cmpl(struct irdma_cq_uk *cq, } info->q_type = (u8)FIELD_GET(IRDMA_CQ_SQ, qword3); + is_srq = (u8)FIELD_GET(IRDMA_CQ_SRQ, qword3); info->error = (bool)FIELD_GET(IRDMA_CQ_ERROR, qword3); info->ipv4 = (bool)FIELD_GET(IRDMACQ_IPV4, qword3); + get_64bit_val(cqe, 8, &comp_ctx); + if (is_srq) + get_64bit_val(cqe, 40, (u64 *)&qp); + else + qp = (struct irdma_qp_uk *)(unsigned long)comp_ctx; if (info->error) { info->major_err = FIELD_GET(IRDMA_CQ_MAJERR, qword3); info->minor_err = FIELD_GET(IRDMA_CQ_MINERR, qword3); - if (info->major_err == IRDMA_FLUSH_MAJOR_ERR) { - info->comp_status = IRDMA_COMPL_STATUS_FLUSHED; + switch (info->major_err) { + case IRDMA_SRQFLUSH_RSVD_MAJOR_ERR: + qp_err = irdma_ae_to_qp_err_code(info->minor_err); + info->minor_err = qp_err.flush_code; + fallthrough; + case IRDMA_FLUSH_MAJOR_ERR: /* Set the min error to standard flush error code for remaining cqes */ if (info->minor_err != FLUSH_GENERAL_ERR) { qword3 &= ~IRDMA_CQ_MINERR; 
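Both atomic verbs above consume a fully populated irdma_post_sq_info, so a usage sketch may help. Assumptions: an initialized irdma_qp_uk, valid local and remote STags, and the reading that fetch_add_data_bytes carries the 64-bit add operand (it is written verbatim at WQE offset 32 above); all values are placeholders:

```c
/* Illustrative only: post an 8-byte remote fetch-and-add. On success
 * the pre-add remote value is delivered to the buffer behind
 * local_va/local_stag.
 */
static int post_fetch_add(struct irdma_qp_uk *qp, u64 local_va,
			  u32 local_stag, u64 remote_va, u32 remote_stag,
			  u64 add_value)
{
	struct irdma_post_sq_info info = {};

	info.wr_id = 1;		/* placeholder completion cookie */
	info.signaled = true;
	info.op_type = IRDMA_OP_TYPE_ATOMIC_FETCH_AND_ADD;
	info.op.atomic_fetch_add.tagged_offset = local_va;
	info.op.atomic_fetch_add.stag = local_stag;
	info.op.atomic_fetch_add.remote_tagged_offset = remote_va;
	info.op.atomic_fetch_add.remote_stag = remote_stag;
	info.op.atomic_fetch_add.fetch_add_data_bytes = add_value;

	return irdma_uk_atomic_fetch_add(qp, &info, true);
}
```

irdma_uk_atomic_compare_swap() is posted the same way, with compare_data_bytes holding the expected value and swap_data_bytes the replacement.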
qword3 |= FIELD_PREP(IRDMA_CQ_MINERR, FLUSH_GENERAL_ERR); set_64bit_val(cqe, 24, qword3); } - } else { - info->comp_status = IRDMA_COMPL_STATUS_UNKNOWN; + info->comp_status = IRDMA_COMPL_STATUS_FLUSHED; + break; + default: +#define IRDMA_CIE_SIGNATURE 0xE +#define IRDMA_CQMAJERR_HIGH_NIBBLE GENMASK(15, 12) + if (info->q_type == IRDMA_CQE_QTYPE_SQ && + qp->qp_type == IRDMA_QP_TYPE_ROCE_UD && + FIELD_GET(IRDMA_CQMAJERR_HIGH_NIBBLE, info->major_err) + == IRDMA_CIE_SIGNATURE) { + info->error = 0; + info->major_err = 0; + info->minor_err = 0; + info->comp_status = IRDMA_COMPL_STATUS_SUCCESS; + } else { + info->comp_status = IRDMA_COMPL_STATUS_UNKNOWN; + } + break; } } else { info->comp_status = IRDMA_COMPL_STATUS_SUCCESS; @@ -1069,7 +1271,6 @@ int irdma_uk_cq_poll_cmpl(struct irdma_cq_uk *cq, get_64bit_val(cqe, 0, &qword0); get_64bit_val(cqe, 16, &qword2); - info->tcp_seq_num_rtt = (u32)FIELD_GET(IRDMACQ_TCPSEQNUMRTT, qword0); info->qp_id = (u32)FIELD_GET(IRDMACQ_QPID, qword2); info->ud_src_qpn = (u32)FIELD_GET(IRDMACQ_UDSRCQPN, qword2); @@ -1085,7 +1286,22 @@ int irdma_uk_cq_poll_cmpl(struct irdma_cq_uk *cq, info->qp_handle = (irdma_qp_handle)(unsigned long)qp; info->op_type = (u8)FIELD_GET(IRDMACQ_OP, qword3); - if (info->q_type == IRDMA_CQE_QTYPE_RQ) { + if (info->q_type == IRDMA_CQE_QTYPE_RQ && is_srq) { + srq = qp->srq_uk; + + get_64bit_val(cqe, 8, &info->wr_id); + info->bytes_xfered = (u32)FIELD_GET(IRDMACQ_PAYLDLEN, qword0); + + if (qword3 & IRDMACQ_STAG) { + info->stag_invalid_set = true; + info->inv_stag = (u32)FIELD_GET(IRDMACQ_INVSTAG, + qword2); + } else { + info->stag_invalid_set = false; + } + IRDMA_RING_MOVE_TAIL(srq->srq_ring); + pring = &srq->srq_ring; + } else if (info->q_type == IRDMA_CQE_QTYPE_RQ && !is_srq) { u32 array_idx; array_idx = wqe_idx / qp->rq_wqe_size_multiplier; @@ -1180,9 +1396,15 @@ int irdma_uk_cq_poll_cmpl(struct irdma_cq_uk *cq, ret_code = 0; exit: - if (!ret_code && info->comp_status == IRDMA_COMPL_STATUS_FLUSHED) + if (!ret_code && info->comp_status == IRDMA_COMPL_STATUS_FLUSHED) { if (pring && IRDMA_RING_MORE_WORK(*pring)) - move_cq_head = false; + /* Park CQ head during a flush to generate additional CQEs + * from SW for all unprocessed WQEs. For GEN3 and beyond + * FW will generate/flush these CQEs so move to the next CQE + */ + move_cq_head = qp->uk_attrs->hw_rev <= IRDMA_GEN_2 ? 
+ false : true; + } if (move_cq_head) { IRDMA_RING_MOVE_HEAD_NOCHECK(cq->cq_ring); @@ -1210,10 +1432,10 @@ exit: } /** - * irdma_qp_round_up - return round up qp wq depth + * irdma_round_up_wq - return round up qp wq depth * @wqdepth: wq depth in quanta to round up */ -static int irdma_qp_round_up(u32 wqdepth) +static int irdma_round_up_wq(u32 wqdepth) { int scount = 1; @@ -1268,7 +1490,7 @@ int irdma_get_sqdepth(struct irdma_uk_attrs *uk_attrs, u32 sq_size, u8 shift, { u32 min_size = (u32)uk_attrs->min_hw_wq_size << shift; - *sqdepth = irdma_qp_round_up((sq_size << shift) + IRDMA_SQ_RSVD); + *sqdepth = irdma_round_up_wq((sq_size << shift) + IRDMA_SQ_RSVD); if (*sqdepth < min_size) *sqdepth = min_size; @@ -1290,7 +1512,7 @@ int irdma_get_rqdepth(struct irdma_uk_attrs *uk_attrs, u32 rq_size, u8 shift, { u32 min_size = (u32)uk_attrs->min_hw_wq_size << shift; - *rqdepth = irdma_qp_round_up((rq_size << shift) + IRDMA_RQ_RSVD); + *rqdepth = irdma_round_up_wq((rq_size << shift) + IRDMA_RQ_RSVD); if (*rqdepth < min_size) *rqdepth = min_size; @@ -1300,6 +1522,26 @@ int irdma_get_rqdepth(struct irdma_uk_attrs *uk_attrs, u32 rq_size, u8 shift, return 0; } +/* + * irdma_get_srqdepth - get SRQ depth (quanta) + * @uk_attrs: qp HW attributes + * @srq_size: SRQ size + * @shift: shift which determines size of WQE + * @srqdepth: depth of SRQ + */ +int irdma_get_srqdepth(struct irdma_uk_attrs *uk_attrs, u32 srq_size, u8 shift, + u32 *srqdepth) +{ + *srqdepth = irdma_round_up_wq((srq_size << shift) + IRDMA_RQ_RSVD); + + if (*srqdepth < ((u32)uk_attrs->min_hw_wq_size << shift)) + *srqdepth = uk_attrs->min_hw_wq_size << shift; + else if (*srqdepth > uk_attrs->max_hw_srq_quanta) + return -EINVAL; + + return 0; +} + static const struct irdma_wqe_uk_ops iw_wqe_uk_ops = { .iw_copy_inline_data = irdma_copy_inline_data, .iw_inline_data_size_to_quanta = irdma_inline_data_size_to_quanta, @@ -1336,6 +1578,42 @@ static void irdma_setup_connection_wqes(struct irdma_qp_uk *qp, } /** + * irdma_uk_srq_init - initialize shared qp + * @srq: hw srq (user and kernel) + * @info: srq initialization info + * + * Initializes the vars used in both user and kernel mode. + * The size of the wqe depends on number of max fragments + * allowed. Then size of wqe * the number of wqes should be the + * amount of memory allocated for srq. 
+ */ +int irdma_uk_srq_init(struct irdma_srq_uk *srq, + struct irdma_srq_uk_init_info *info) +{ + u8 rqshift; + + srq->uk_attrs = info->uk_attrs; + if (info->max_srq_frag_cnt > srq->uk_attrs->max_hw_wq_frags) + return -EINVAL; + + irdma_get_wqe_shift(srq->uk_attrs, info->max_srq_frag_cnt, 0, &rqshift); + srq->srq_caps = info->srq_caps; + srq->srq_base = info->srq; + srq->shadow_area = info->shadow_area; + srq->srq_id = info->srq_id; + srq->srwqe_polarity = 0; + srq->srq_size = info->srq_size; + srq->wqe_size = rqshift; + srq->max_srq_frag_cnt = min(srq->uk_attrs->max_hw_wq_frags, + ((u32)2 << rqshift) - 1); + IRDMA_RING_INIT(srq->srq_ring, srq->srq_size); + srq->wqe_size_multiplier = 1 << rqshift; + srq->wqe_ops = iw_wqe_uk_ops; + + return 0; +} + +/** * irdma_uk_calc_shift_wq - calculate WQE shift for both SQ and RQ * @ukinfo: qp initialization info * @sq_shift: Returns shift of SQ @@ -1461,6 +1739,7 @@ int irdma_uk_qp_init(struct irdma_qp_uk *qp, struct irdma_qp_uk_init_info *info) qp->wqe_ops = iw_wqe_uk_ops_gen_1; else qp->wqe_ops = iw_wqe_uk_ops; + qp->srq_uk = info->srq_uk; return ret_code; } diff --git a/drivers/infiniband/hw/irdma/user.h b/drivers/infiniband/hw/irdma/user.h index 380e4a47aede..ab57f689827a 100644 --- a/drivers/infiniband/hw/irdma/user.h +++ b/drivers/infiniband/hw/irdma/user.h @@ -41,10 +41,114 @@ #define IRDMA_OP_TYPE_INV_STAG 0x0a #define IRDMA_OP_TYPE_RDMA_READ_INV_STAG 0x0b #define IRDMA_OP_TYPE_NOP 0x0c +#define IRDMA_OP_TYPE_ATOMIC_FETCH_AND_ADD 0x0f +#define IRDMA_OP_TYPE_ATOMIC_COMPARE_AND_SWAP 0x11 #define IRDMA_OP_TYPE_REC 0x3e #define IRDMA_OP_TYPE_REC_IMM 0x3f -#define IRDMA_FLUSH_MAJOR_ERR 1 +#define IRDMA_FLUSH_MAJOR_ERR 1 +#define IRDMA_SRQFLUSH_RSVD_MAJOR_ERR 0xfffe + +/* Async Events codes */ +#define IRDMA_AE_AMP_UNALLOCATED_STAG 0x0102 +#define IRDMA_AE_AMP_INVALID_STAG 0x0103 +#define IRDMA_AE_AMP_BAD_QP 0x0104 +#define IRDMA_AE_AMP_BAD_PD 0x0105 +#define IRDMA_AE_AMP_BAD_STAG_KEY 0x0106 +#define IRDMA_AE_AMP_BAD_STAG_INDEX 0x0107 +#define IRDMA_AE_AMP_BOUNDS_VIOLATION 0x0108 +#define IRDMA_AE_AMP_RIGHTS_VIOLATION 0x0109 +#define IRDMA_AE_AMP_TO_WRAP 0x010a +#define IRDMA_AE_AMP_FASTREG_VALID_STAG 0x010c +#define IRDMA_AE_AMP_FASTREG_MW_STAG 0x010d +#define IRDMA_AE_AMP_FASTREG_INVALID_RIGHTS 0x010e +#define IRDMA_AE_AMP_FASTREG_INVALID_LENGTH 0x0110 +#define IRDMA_AE_AMP_INVALIDATE_SHARED 0x0111 +#define IRDMA_AE_AMP_INVALIDATE_NO_REMOTE_ACCESS_RIGHTS 0x0112 +#define IRDMA_AE_AMP_INVALIDATE_MR_WITH_BOUND_WINDOWS 0x0113 +#define IRDMA_AE_AMP_MWBIND_VALID_STAG 0x0114 +#define IRDMA_AE_AMP_MWBIND_OF_MR_STAG 0x0115 +#define IRDMA_AE_AMP_MWBIND_TO_ZERO_BASED_STAG 0x0116 +#define IRDMA_AE_AMP_MWBIND_TO_MW_STAG 0x0117 +#define IRDMA_AE_AMP_MWBIND_INVALID_RIGHTS 0x0118 +#define IRDMA_AE_AMP_MWBIND_INVALID_BOUNDS 0x0119 +#define IRDMA_AE_AMP_MWBIND_TO_INVALID_PARENT 0x011a +#define IRDMA_AE_AMP_MWBIND_BIND_DISABLED 0x011b +#define IRDMA_AE_PRIV_OPERATION_DENIED 0x011c +#define IRDMA_AE_AMP_INVALIDATE_TYPE1_MW 0x011d +#define IRDMA_AE_AMP_MWBIND_ZERO_BASED_TYPE1_MW 0x011e +#define IRDMA_AE_AMP_FASTREG_INVALID_PBL_HPS_CFG 0x011f +#define IRDMA_AE_AMP_MWBIND_WRONG_TYPE 0x0120 +#define IRDMA_AE_AMP_FASTREG_PBLE_MISMATCH 0x0121 +#define IRDMA_AE_UDA_XMIT_DGRAM_TOO_LONG 0x0132 +#define IRDMA_AE_UDA_XMIT_BAD_PD 0x0133 +#define IRDMA_AE_UDA_XMIT_DGRAM_TOO_SHORT 0x0134 +#define IRDMA_AE_UDA_L4LEN_INVALID 0x0135 +#define IRDMA_AE_BAD_CLOSE 0x0201 +#define IRDMA_AE_RDMAP_ROE_BAD_LLP_CLOSE 0x0202 +#define IRDMA_AE_CQ_OPERATION_ERROR 0x0203 +#define 
IRDMA_AE_RDMA_READ_WHILE_ORD_ZERO 0x0205 +#define IRDMA_AE_STAG_ZERO_INVALID 0x0206 +#define IRDMA_AE_IB_RREQ_AND_Q1_FULL 0x0207 +#define IRDMA_AE_IB_INVALID_REQUEST 0x0208 +#define IRDMA_AE_SRQ_LIMIT 0x0209 +#define IRDMA_AE_WQE_UNEXPECTED_OPCODE 0x020a +#define IRDMA_AE_WQE_INVALID_PARAMETER 0x020b +#define IRDMA_AE_WQE_INVALID_FRAG_DATA 0x020c +#define IRDMA_AE_IB_REMOTE_ACCESS_ERROR 0x020d +#define IRDMA_AE_IB_REMOTE_OP_ERROR 0x020e +#define IRDMA_AE_SRQ_CATASTROPHIC_ERROR 0x020f +#define IRDMA_AE_WQE_LSMM_TOO_LONG 0x0220 +#define IRDMA_AE_ATOMIC_ALIGNMENT 0x0221 +#define IRDMA_AE_ATOMIC_MASK 0x0222 +#define IRDMA_AE_INVALID_REQUEST 0x0223 +#define IRDMA_AE_PCIE_ATOMIC_DISABLE 0x0224 +#define IRDMA_AE_DDP_INVALID_MSN_GAP_IN_MSN 0x0301 +#define IRDMA_AE_DDP_UBE_DDP_MESSAGE_TOO_LONG_FOR_AVAILABLE_BUFFER 0x0303 +#define IRDMA_AE_DDP_UBE_INVALID_DDP_VERSION 0x0304 +#define IRDMA_AE_DDP_UBE_INVALID_MO 0x0305 +#define IRDMA_AE_DDP_UBE_INVALID_MSN_NO_BUFFER_AVAILABLE 0x0306 +#define IRDMA_AE_DDP_UBE_INVALID_QN 0x0307 +#define IRDMA_AE_DDP_NO_L_BIT 0x0308 +#define IRDMA_AE_RDMAP_ROE_INVALID_RDMAP_VERSION 0x0311 +#define IRDMA_AE_RDMAP_ROE_UNEXPECTED_OPCODE 0x0312 +#define IRDMA_AE_ROE_INVALID_RDMA_READ_REQUEST 0x0313 +#define IRDMA_AE_ROE_INVALID_RDMA_WRITE_OR_READ_RESP 0x0314 +#define IRDMA_AE_ROCE_RSP_LENGTH_ERROR 0x0316 +#define IRDMA_AE_ROCE_EMPTY_MCG 0x0380 +#define IRDMA_AE_ROCE_BAD_MC_IP_ADDR 0x0381 +#define IRDMA_AE_ROCE_BAD_MC_QPID 0x0382 +#define IRDMA_AE_MCG_QP_PROTOCOL_MISMATCH 0x0383 +#define IRDMA_AE_INVALID_ARP_ENTRY 0x0401 +#define IRDMA_AE_INVALID_TCP_OPTION_RCVD 0x0402 +#define IRDMA_AE_STALE_ARP_ENTRY 0x0403 +#define IRDMA_AE_INVALID_AH_ENTRY 0x0406 +#define IRDMA_AE_LLP_CLOSE_COMPLETE 0x0501 +#define IRDMA_AE_LLP_CONNECTION_RESET 0x0502 +#define IRDMA_AE_LLP_FIN_RECEIVED 0x0503 +#define IRDMA_AE_LLP_RECEIVED_MARKER_AND_LENGTH_FIELDS_DONT_MATCH 0x0504 +#define IRDMA_AE_LLP_RECEIVED_MPA_CRC_ERROR 0x0505 +#define IRDMA_AE_LLP_SEGMENT_TOO_SMALL 0x0507 +#define IRDMA_AE_LLP_SYN_RECEIVED 0x0508 +#define IRDMA_AE_LLP_TERMINATE_RECEIVED 0x0509 +#define IRDMA_AE_LLP_TOO_MANY_RETRIES 0x050a +#define IRDMA_AE_LLP_TOO_MANY_KEEPALIVE_RETRIES 0x050b +#define IRDMA_AE_LLP_DOUBT_REACHABILITY 0x050c +#define IRDMA_AE_LLP_CONNECTION_ESTABLISHED 0x050e +#define IRDMA_AE_LLP_TOO_MANY_RNRS 0x050f +#define IRDMA_AE_RESOURCE_EXHAUSTION 0x0520 +#define IRDMA_AE_RESET_SENT 0x0601 +#define IRDMA_AE_TERMINATE_SENT 0x0602 +#define IRDMA_AE_RESET_NOT_SENT 0x0603 +#define IRDMA_AE_LCE_QP_CATASTROPHIC 0x0700 +#define IRDMA_AE_LCE_FUNCTION_CATASTROPHIC 0x0701 +#define IRDMA_AE_LCE_CQ_CATASTROPHIC 0x0702 +#define IRDMA_AE_REMOTE_QP_CATASTROPHIC 0x0703 +#define IRDMA_AE_LOCAL_QP_CATASTROPHIC 0x0704 +#define IRDMA_AE_RCE_QP_CATASTROPHIC 0x0705 +#define IRDMA_AE_QP_SUSPEND_COMPLETE 0x0900 +#define IRDMA_AE_CQP_DEFERRED_COMPLETE 0x0901 +#define IRDMA_AE_ADAPTER_CATASTROPHIC 0x0B0B enum irdma_device_caps_const { IRDMA_WQE_SIZE = 4, @@ -55,11 +159,12 @@ enum irdma_device_caps_const { IRDMA_CEQE_SIZE = 1, IRDMA_CQP_CTX_SIZE = 8, IRDMA_SHADOW_AREA_SIZE = 8, - IRDMA_QUERY_FPM_BUF_SIZE = 176, - IRDMA_COMMIT_FPM_BUF_SIZE = 176, + IRDMA_QUERY_FPM_BUF_SIZE = 192, + IRDMA_COMMIT_FPM_BUF_SIZE = 192, IRDMA_GATHER_STATS_BUF_SIZE = 1024, IRDMA_MIN_IW_QP_ID = 0, IRDMA_MAX_IW_QP_ID = 262143, + IRDMA_MIN_IW_SRQ_ID = 0, IRDMA_MIN_CEQID = 0, IRDMA_MAX_CEQID = 1023, IRDMA_CEQ_MAX_COUNT = IRDMA_MAX_CEQID + 1, @@ -67,6 +172,7 @@ enum irdma_device_caps_const { IRDMA_MAX_CQID = 524287, IRDMA_MIN_AEQ_ENTRIES = 1, IRDMA_MAX_AEQ_ENTRIES 
= 524287, + IRDMA_MAX_AEQ_ENTRIES_GEN_3 = 262144, IRDMA_MIN_CEQ_ENTRIES = 1, IRDMA_MAX_CEQ_ENTRIES = 262143, IRDMA_MIN_CQ_SIZE = 1, @@ -105,6 +211,13 @@ enum irdma_flush_opcode { FLUSH_RETRY_EXC_ERR, FLUSH_MW_BIND_ERR, FLUSH_REM_INV_REQ_ERR, + FLUSH_RNR_RETRY_EXC_ERR, +}; + +enum irdma_qp_event_type { + IRDMA_QP_EVENT_CATASTROPHIC, + IRDMA_QP_EVENT_ACCESS_ERR, + IRDMA_QP_EVENT_REQ_ERR, }; enum irdma_cmpl_status { @@ -147,6 +260,8 @@ enum irdma_qp_caps { IRDMA_PUSH_MODE = 8, }; +struct irdma_srq_uk; +struct irdma_srq_uk_init_info; struct irdma_qp_uk; struct irdma_cq_uk; struct irdma_qp_uk_init_info; @@ -201,6 +316,24 @@ struct irdma_bind_window { bool ena_writes:1; irdma_stag mw_stag; bool mem_window_type_1:1; + bool remote_atomics_en:1; +}; + +struct irdma_atomic_fetch_add { + u64 tagged_offset; + u64 remote_tagged_offset; + u64 fetch_add_data_bytes; + u32 stag; + u32 remote_stag; +}; + +struct irdma_atomic_compare_swap { + u64 tagged_offset; + u64 remote_tagged_offset; + u64 swap_data_bytes; + u64 compare_data_bytes; + u32 stag; + u32 remote_stag; }; struct irdma_inv_local_stag { @@ -219,6 +352,7 @@ struct irdma_post_sq_info { bool report_rtt:1; bool udp_hdr:1; bool defer_flag:1; + bool remote_atomic_en:1; u32 imm_data; u32 stag_to_inv; union { @@ -227,6 +361,8 @@ struct irdma_post_sq_info { struct irdma_rdma_read rdma_read; struct irdma_bind_window bind_window; struct irdma_inv_local_stag inv_local_stag; + struct irdma_atomic_fetch_add atomic_fetch_add; + struct irdma_atomic_compare_swap atomic_compare_swap; } op; }; @@ -255,6 +391,15 @@ struct irdma_cq_poll_info { bool imm_valid:1; }; +struct qp_err_code { + enum irdma_flush_opcode flush_code; + enum irdma_qp_event_type event_type; +}; + +int irdma_uk_atomic_compare_swap(struct irdma_qp_uk *qp, + struct irdma_post_sq_info *info, bool post_sq); +int irdma_uk_atomic_fetch_add(struct irdma_qp_uk *qp, + struct irdma_post_sq_info *info, bool post_sq); int irdma_uk_inline_rdma_write(struct irdma_qp_uk *qp, struct irdma_post_sq_info *info, bool post_sq); int irdma_uk_inline_send(struct irdma_qp_uk *qp, @@ -300,6 +445,39 @@ int irdma_uk_calc_depth_shift_sq(struct irdma_qp_uk_init_info *ukinfo, u32 *sq_depth, u8 *sq_shift); int irdma_uk_calc_depth_shift_rq(struct irdma_qp_uk_init_info *ukinfo, u32 *rq_depth, u8 *rq_shift); +int irdma_uk_srq_init(struct irdma_srq_uk *srq, + struct irdma_srq_uk_init_info *info); +int irdma_uk_srq_post_receive(struct irdma_srq_uk *srq, + struct irdma_post_rq_info *info); + +struct irdma_srq_uk { + u32 srq_caps; + struct irdma_qp_quanta *srq_base; + struct irdma_uk_attrs *uk_attrs; + __le64 *shadow_area; + struct irdma_ring srq_ring; + struct irdma_ring initial_ring; + u32 srq_id; + u32 srq_size; + u32 max_srq_frag_cnt; + struct irdma_wqe_uk_ops wqe_ops; + u8 srwqe_polarity; + u8 wqe_size; + u8 wqe_size_multiplier; + u8 deferred_flag; +}; + +struct irdma_srq_uk_init_info { + struct irdma_qp_quanta *srq; + struct irdma_uk_attrs *uk_attrs; + __le64 *shadow_area; + u64 *srq_wrid_array; + u32 srq_id; + u32 srq_caps; + u32 srq_size; + u32 max_srq_frag_cnt; +}; + struct irdma_sq_uk_wr_trk_info { u64 wrid; u32 wr_len; @@ -344,6 +522,7 @@ struct irdma_qp_uk { bool destroy_pending:1; /* Indicates the QP is being destroyed */ void *back_qp; u8 dbg_rq_flushed; + struct irdma_srq_uk *srq_uk; u8 sq_flush_seen; u8 rq_flush_seen; }; @@ -383,6 +562,7 @@ struct irdma_qp_uk_init_info { u8 rq_shift; int abi_ver; bool legacy_mode; + struct irdma_srq_uk *srq_uk; }; struct irdma_cq_uk_init_info { @@ -398,6 +578,7 @@ struct 
irdma_cq_uk_init_info { __le64 *irdma_qp_get_next_send_wqe(struct irdma_qp_uk *qp, u32 *wqe_idx, u16 quanta, u32 total_size, struct irdma_post_sq_info *info); +__le64 *irdma_srq_get_next_recv_wqe(struct irdma_srq_uk *srq, u32 *wqe_idx); __le64 *irdma_qp_get_next_recv_wqe(struct irdma_qp_uk *qp, u32 *wqe_idx); void irdma_uk_clean_cq(void *q, struct irdma_cq_uk *cq); int irdma_nop(struct irdma_qp_uk *qp, u64 wr_id, bool signaled, bool post_sq); @@ -409,5 +590,85 @@ int irdma_get_sqdepth(struct irdma_uk_attrs *uk_attrs, u32 sq_size, u8 shift, u32 *wqdepth); int irdma_get_rqdepth(struct irdma_uk_attrs *uk_attrs, u32 rq_size, u8 shift, u32 *wqdepth); +int irdma_get_srqdepth(struct irdma_uk_attrs *uk_attrs, u32 srq_size, u8 shift, + u32 *srqdepth); void irdma_clr_wqes(struct irdma_qp_uk *qp, u32 qp_wqe_idx); + +static inline struct qp_err_code irdma_ae_to_qp_err_code(u16 ae_id) +{ + struct qp_err_code qp_err = {}; + + switch (ae_id) { + case IRDMA_AE_AMP_BOUNDS_VIOLATION: + case IRDMA_AE_AMP_INVALID_STAG: + case IRDMA_AE_AMP_RIGHTS_VIOLATION: + case IRDMA_AE_AMP_UNALLOCATED_STAG: + case IRDMA_AE_AMP_BAD_PD: + case IRDMA_AE_AMP_BAD_QP: + case IRDMA_AE_AMP_BAD_STAG_KEY: + case IRDMA_AE_AMP_BAD_STAG_INDEX: + case IRDMA_AE_AMP_TO_WRAP: + case IRDMA_AE_PRIV_OPERATION_DENIED: + qp_err.flush_code = FLUSH_PROT_ERR; + qp_err.event_type = IRDMA_QP_EVENT_ACCESS_ERR; + break; + case IRDMA_AE_UDA_XMIT_BAD_PD: + case IRDMA_AE_WQE_UNEXPECTED_OPCODE: + qp_err.flush_code = FLUSH_LOC_QP_OP_ERR; + qp_err.event_type = IRDMA_QP_EVENT_CATASTROPHIC; + break; + case IRDMA_AE_UDA_XMIT_DGRAM_TOO_SHORT: + case IRDMA_AE_UDA_XMIT_DGRAM_TOO_LONG: + case IRDMA_AE_UDA_L4LEN_INVALID: + case IRDMA_AE_DDP_UBE_INVALID_MO: + case IRDMA_AE_DDP_UBE_DDP_MESSAGE_TOO_LONG_FOR_AVAILABLE_BUFFER: + qp_err.flush_code = FLUSH_LOC_LEN_ERR; + qp_err.event_type = IRDMA_QP_EVENT_CATASTROPHIC; + break; + case IRDMA_AE_AMP_INVALIDATE_NO_REMOTE_ACCESS_RIGHTS: + case IRDMA_AE_IB_REMOTE_ACCESS_ERROR: + qp_err.flush_code = FLUSH_REM_ACCESS_ERR; + qp_err.event_type = IRDMA_QP_EVENT_ACCESS_ERR; + break; + case IRDMA_AE_AMP_MWBIND_INVALID_RIGHTS: + case IRDMA_AE_AMP_MWBIND_BIND_DISABLED: + case IRDMA_AE_AMP_MWBIND_INVALID_BOUNDS: + case IRDMA_AE_AMP_MWBIND_VALID_STAG: + qp_err.flush_code = FLUSH_MW_BIND_ERR; + qp_err.event_type = IRDMA_QP_EVENT_ACCESS_ERR; + break; + case IRDMA_AE_LLP_TOO_MANY_RETRIES: + qp_err.flush_code = FLUSH_RETRY_EXC_ERR; + qp_err.event_type = IRDMA_QP_EVENT_CATASTROPHIC; + break; + case IRDMA_AE_IB_INVALID_REQUEST: + qp_err.flush_code = FLUSH_REM_INV_REQ_ERR; + qp_err.event_type = IRDMA_QP_EVENT_REQ_ERR; + break; + case IRDMA_AE_LLP_SEGMENT_TOO_SMALL: + case IRDMA_AE_LLP_RECEIVED_MPA_CRC_ERROR: + case IRDMA_AE_ROCE_RSP_LENGTH_ERROR: + case IRDMA_AE_IB_REMOTE_OP_ERROR: + qp_err.flush_code = FLUSH_REM_OP_ERR; + qp_err.event_type = IRDMA_QP_EVENT_CATASTROPHIC; + break; + case IRDMA_AE_LLP_TOO_MANY_RNRS: + qp_err.flush_code = FLUSH_RNR_RETRY_EXC_ERR; + qp_err.event_type = IRDMA_QP_EVENT_CATASTROPHIC; + break; + case IRDMA_AE_LCE_QP_CATASTROPHIC: + case IRDMA_AE_REMOTE_QP_CATASTROPHIC: + case IRDMA_AE_LOCAL_QP_CATASTROPHIC: + case IRDMA_AE_RCE_QP_CATASTROPHIC: + qp_err.flush_code = FLUSH_FATAL_ERR; + qp_err.event_type = IRDMA_QP_EVENT_CATASTROPHIC; + break; + default: + qp_err.flush_code = FLUSH_GENERAL_ERR; + qp_err.event_type = IRDMA_QP_EVENT_CATASTROPHIC; + break; + } + + return qp_err; +} #endif /* IRDMA_USER_H */ diff --git a/drivers/infiniband/hw/irdma/utils.c b/drivers/infiniband/hw/irdma/utils.c index b510ef747399..8b94d87b0192 
100644 --- a/drivers/infiniband/hw/irdma/utils.c +++ b/drivers/infiniband/hw/irdma/utils.c @@ -481,6 +481,7 @@ void irdma_free_cqp_request(struct irdma_cqp *cqp, WRITE_ONCE(cqp_request->request_done, false); cqp_request->callback_fcn = NULL; cqp_request->waiting = false; + cqp_request->pending = false; spin_lock_irqsave(&cqp->req_lock, flags); list_add_tail(&cqp_request->list, &cqp->cqp_avail_reqs); @@ -521,6 +522,22 @@ irdma_free_pending_cqp_request(struct irdma_cqp *cqp, } /** + * irdma_cleanup_deferred_cqp_ops - clean-up cqp with no completions + * @dev: sc_dev + * @cqp: cqp + */ +static void irdma_cleanup_deferred_cqp_ops(struct irdma_sc_dev *dev, + struct irdma_cqp *cqp) +{ + u64 scratch; + + /* process all CQP requests with deferred/pending completions */ + while ((scratch = irdma_sc_cqp_cleanup_handler(dev))) + irdma_free_pending_cqp_request(cqp, (struct irdma_cqp_request *) + (uintptr_t)scratch); +} + +/** * irdma_cleanup_pending_cqp_op - clean-up cqp with no * completions * @rf: RDMA PCI function @@ -533,6 +550,8 @@ void irdma_cleanup_pending_cqp_op(struct irdma_pci_f *rf) struct cqp_cmds_info *pcmdinfo = NULL; u32 i, pending_work, wqe_idx; + if (dev->hw_attrs.uk_attrs.hw_rev >= IRDMA_GEN_3) + irdma_cleanup_deferred_cqp_ops(dev, cqp); pending_work = IRDMA_RING_USED_QUANTA(cqp->sc_cqp.sq_ring); wqe_idx = IRDMA_RING_CURRENT_TAIL(cqp->sc_cqp.sq_ring); for (i = 0; i < pending_work; i++) { @@ -552,6 +571,26 @@ void irdma_cleanup_pending_cqp_op(struct irdma_pci_f *rf) } } +static int irdma_get_timeout_threshold(struct irdma_sc_dev *dev) +{ + u16 time_s = dev->vc_caps.cqp_timeout_s; + + if (!time_s) + return CQP_TIMEOUT_THRESHOLD; + + return time_s * 1000 / dev->hw_attrs.max_cqp_compl_wait_time_ms; +} + +static int irdma_get_def_timeout_threshold(struct irdma_sc_dev *dev) +{ + u16 time_s = dev->vc_caps.cqp_def_timeout_s; + + if (!time_s) + return CQP_DEF_CMPL_TIMEOUT_THRESHOLD; + + return time_s * 1000 / dev->hw_attrs.max_cqp_compl_wait_time_ms; +} + /** * irdma_wait_event - wait for completion * @rf: RDMA PCI function @@ -561,6 +600,7 @@ static int irdma_wait_event(struct irdma_pci_f *rf, struct irdma_cqp_request *cqp_request) { struct irdma_cqp_timeout cqp_timeout = {}; + int timeout_threshold = irdma_get_timeout_threshold(&rf->sc_dev); bool cqp_error = false; int err_code = 0; @@ -572,9 +612,17 @@ static int irdma_wait_event(struct irdma_pci_f *rf, msecs_to_jiffies(CQP_COMPL_WAIT_TIME_MS))) break; + if (cqp_request->pending) + /* There was a deferred or pending completion + * received for this CQP request, so we need + * to wait longer than usual. 
+ */ + timeout_threshold = + irdma_get_def_timeout_threshold(&rf->sc_dev); + irdma_check_cqp_progress(&cqp_timeout, &rf->sc_dev); - if (cqp_timeout.count < CQP_TIMEOUT_THRESHOLD) + if (cqp_timeout.count < timeout_threshold) continue; if (!rf->reset) { @@ -649,6 +697,9 @@ static const char *const irdma_cqp_cmd_names[IRDMA_MAX_CQP_OPS] = { [IRDMA_OP_ADD_LOCAL_MAC_ENTRY] = "Add Local MAC Entry Cmd", [IRDMA_OP_DELETE_LOCAL_MAC_ENTRY] = "Delete Local MAC Entry Cmd", [IRDMA_OP_CQ_MODIFY] = "CQ Modify Cmd", + [IRDMA_OP_SRQ_CREATE] = "Create SRQ Cmd", + [IRDMA_OP_SRQ_MODIFY] = "Modify SRQ Cmd", + [IRDMA_OP_SRQ_DESTROY] = "Destroy SRQ Cmd", }; static const struct irdma_cqp_err_info irdma_noncrit_err_list[] = { @@ -1065,6 +1116,26 @@ static void irdma_dealloc_push_page(struct irdma_pci_f *rf, irdma_put_cqp_request(&rf->cqp, cqp_request); } +static void irdma_free_gsi_qp_rsrc(struct irdma_qp *iwqp, u32 qp_num) +{ + struct irdma_device *iwdev = iwqp->iwdev; + struct irdma_pci_f *rf = iwdev->rf; + unsigned long flags; + + if (rf->sc_dev.hw_attrs.uk_attrs.hw_rev < IRDMA_GEN_3) + return; + + irdma_vchnl_req_del_vport(&rf->sc_dev, iwdev->vport_id, qp_num); + + if (qp_num == 1) { + spin_lock_irqsave(&rf->rsrc_lock, flags); + rf->hwqp1_rsvd = false; + spin_unlock_irqrestore(&rf->rsrc_lock, flags); + } else if (qp_num > 2) { + irdma_free_rsrc(rf, rf->allocated_qps, qp_num); + } +} + /** * irdma_free_qp_rsrc - free up memory resources for qp * @iwqp: qp ptr (user or kernel) @@ -1073,7 +1144,7 @@ void irdma_free_qp_rsrc(struct irdma_qp *iwqp) { struct irdma_device *iwdev = iwqp->iwdev; struct irdma_pci_f *rf = iwdev->rf; - u32 qp_num = iwqp->ibqp.qp_num; + u32 qp_num = iwqp->sc_qp.qp_uk.qp_id; irdma_ieq_cleanup_qp(iwdev->vsi.ieq, &iwqp->sc_qp); irdma_dealloc_push_page(rf, &iwqp->sc_qp); @@ -1083,8 +1154,12 @@ void irdma_free_qp_rsrc(struct irdma_qp *iwqp) iwqp->sc_qp.user_pri); } - if (qp_num > 2) - irdma_free_rsrc(rf, rf->allocated_qps, qp_num); + if (iwqp->ibqp.qp_type == IB_QPT_GSI) { + irdma_free_gsi_qp_rsrc(iwqp, qp_num); + } else { + if (qp_num > 2) + irdma_free_rsrc(rf, rf->allocated_qps, qp_num); + } dma_free_coherent(rf->sc_dev.hw->device, iwqp->q2_ctx_mem.size, iwqp->q2_ctx_mem.va, iwqp->q2_ctx_mem.pa); iwqp->q2_ctx_mem.va = NULL; @@ -1096,6 +1171,30 @@ void irdma_free_qp_rsrc(struct irdma_qp *iwqp) } /** + * irdma_srq_wq_destroy - send srq destroy cqp + * @rf: RDMA PCI function + * @srq: hardware control srq + */ +void irdma_srq_wq_destroy(struct irdma_pci_f *rf, struct irdma_sc_srq *srq) +{ + struct irdma_cqp_request *cqp_request; + struct cqp_cmds_info *cqp_info; + + cqp_request = irdma_alloc_and_get_cqp_request(&rf->cqp, true); + if (!cqp_request) + return; + + cqp_info = &cqp_request->info; + cqp_info->cqp_cmd = IRDMA_OP_SRQ_DESTROY; + cqp_info->post_sq = 1; + cqp_info->in.u.srq_destroy.srq = srq; + cqp_info->in.u.srq_destroy.scratch = (uintptr_t)cqp_request; + + irdma_handle_cqp_op(rf, cqp_request); + irdma_put_cqp_request(&rf->cqp, cqp_request); +} + +/** * irdma_cq_wq_destroy - send cq destroy cqp * @rf: RDMA PCI function * @cq: hardware control cq @@ -2266,7 +2365,10 @@ bool irdma_cq_empty(struct irdma_cq *iwcq) u8 polarity; ukcq = &iwcq->sc_cq.cq_uk; - cqe = IRDMA_GET_CURRENT_CQ_ELEM(ukcq); + if (ukcq->avoid_mem_cflct) + cqe = IRDMA_GET_CURRENT_EXTENDED_CQ_ELEM(ukcq); + else + cqe = IRDMA_GET_CURRENT_CQ_ELEM(ukcq); get_64bit_val(cqe, 24, &qword3); polarity = (u8)FIELD_GET(IRDMA_CQ_VALID, qword3); diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c index 
da5a41b275d8..76ce6137f2ba 100644 --- a/drivers/infiniband/hw/irdma/verbs.c +++ b/drivers/infiniband/hw/irdma/verbs.c @@ -41,7 +41,8 @@ static int irdma_query_device(struct ib_device *ibdev, props->max_cq = rf->max_cq - rf->used_cqs; props->max_cqe = rf->max_cqe - 1; props->max_mr = rf->max_mr - rf->used_mrs; - props->max_mw = props->max_mr; + if (hw_attrs->uk_attrs.hw_rev >= IRDMA_GEN_3) + props->max_mw = props->max_mr; props->max_pd = rf->max_pd - rf->used_pds; props->max_sge_rd = hw_attrs->uk_attrs.max_hw_read_sges; props->max_qp_rd_atom = hw_attrs->max_hw_ird; @@ -56,9 +57,21 @@ static int irdma_query_device(struct ib_device *ibdev, props->max_mcast_qp_attach = IRDMA_MAX_MGS_PER_CTX; props->max_total_mcast_qp_attach = rf->max_qp * IRDMA_MAX_MGS_PER_CTX; props->max_fast_reg_page_list_len = IRDMA_MAX_PAGES_PER_FMR; -#define HCA_CLOCK_TIMESTAMP_MASK 0x1ffff - if (hw_attrs->uk_attrs.hw_rev >= IRDMA_GEN_2) - props->timestamp_mask = HCA_CLOCK_TIMESTAMP_MASK; + props->max_srq = rf->max_srq - rf->used_srqs; + props->max_srq_wr = IRDMA_MAX_SRQ_WRS; + props->max_srq_sge = hw_attrs->uk_attrs.max_hw_wq_frags; + if (hw_attrs->uk_attrs.feature_flags & IRDMA_FEATURE_ATOMIC_OPS) + props->atomic_cap = IB_ATOMIC_HCA; + else + props->atomic_cap = IB_ATOMIC_NONE; + props->masked_atomic_cap = props->atomic_cap; + if (hw_attrs->uk_attrs.hw_rev >= IRDMA_GEN_3) { +#define HCA_CORE_CLOCK_KHZ 1000000UL + props->timestamp_mask = GENMASK(31, 0); + props->hca_core_clock = HCA_CORE_CLOCK_KHZ; + } + if (hw_attrs->uk_attrs.hw_rev >= IRDMA_GEN_3) + props->device_cap_flags |= IB_DEVICE_MEM_WINDOW_TYPE_2B; return 0; } @@ -292,6 +305,10 @@ static int irdma_alloc_ucontext(struct ib_ucontext *uctx, ucontext->iwdev = iwdev; ucontext->abi_ver = req.userspace_ver; + if (!(req.comp_mask & IRDMA_SUPPORT_WQE_FORMAT_V2) && + uk_attrs->hw_rev >= IRDMA_GEN_3) + return -EOPNOTSUPP; + if (req.comp_mask & IRDMA_ALLOC_UCTX_USE_RAW_ATTR) ucontext->use_raw_attrs = true; @@ -332,6 +349,8 @@ static int irdma_alloc_ucontext(struct ib_ucontext *uctx, uresp.comp_mask |= IRDMA_ALLOC_UCTX_USE_RAW_ATTR; uresp.min_hw_wq_size = uk_attrs->min_hw_wq_size; uresp.comp_mask |= IRDMA_ALLOC_UCTX_MIN_HW_WQ_SIZE; + uresp.max_hw_srq_quanta = uk_attrs->max_hw_srq_quanta; + uresp.comp_mask |= IRDMA_ALLOC_UCTX_MAX_HW_SRQ_QUANTA; if (ib_copy_to_udata(udata, &uresp, min(sizeof(uresp), udata->outlen))) { rdma_user_mmap_entry_remove(ucontext->db_mmap_entry); @@ -343,6 +362,8 @@ static int irdma_alloc_ucontext(struct ib_ucontext *uctx, spin_lock_init(&ucontext->cq_reg_mem_list_lock); INIT_LIST_HEAD(&ucontext->qp_reg_mem_list); spin_lock_init(&ucontext->qp_reg_mem_list_lock); + INIT_LIST_HEAD(&ucontext->srq_reg_mem_list); + spin_lock_init(&ucontext->srq_reg_mem_list_lock); return 0; @@ -521,7 +542,7 @@ static int irdma_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) iwqp->sc_qp.qp_uk.destroy_pending = true; - if (iwqp->iwarp_state == IRDMA_QP_STATE_RTS) + if (iwqp->iwarp_state >= IRDMA_QP_STATE_IDLE) irdma_modify_qp_to_err(&iwqp->sc_qp); if (!iwqp->user_mode) @@ -541,6 +562,9 @@ static int irdma_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) irdma_cqp_qp_destroy_cmd(&iwdev->rf->sc_dev, &iwqp->sc_qp); irdma_remove_push_mmap_entries(iwqp); + + if (iwqp->sc_qp.qp_uk.qp_id == 1) + iwdev->rf->hwqp1_rsvd = false; irdma_free_qp_rsrc(iwqp); return 0; @@ -564,7 +588,11 @@ static void irdma_setup_virt_qp(struct irdma_device *iwdev, if (iwpbl->pbl_allocated) { init_info->virtual_map = true; init_info->sq_pa = qpmr->sq_pbl.idx; - init_info->rq_pa = 
qpmr->rq_pbl.idx; + /* Need to use contiguous buffer for RQ of QP + * in case it is associated with SRQ. + */ + init_info->rq_pa = init_info->qp_uk_init_info.srq_uk ? + qpmr->rq_pa : qpmr->rq_pbl.idx; } else { init_info->sq_pa = qpmr->sq_pbl.addr; init_info->rq_pa = qpmr->rq_pbl.addr; @@ -719,6 +747,7 @@ static int irdma_setup_kmode_qp(struct irdma_device *iwdev, info->rq_pa + (ukinfo->rq_depth * IRDMA_QP_WQE_MIN_SIZE); ukinfo->sq_size = ukinfo->sq_depth >> ukinfo->sq_shift; ukinfo->rq_size = ukinfo->rq_depth >> ukinfo->rq_shift; + ukinfo->qp_id = info->qp_uk_init_info.qp_id; iwqp->max_send_wr = (ukinfo->sq_depth - IRDMA_SQ_RSVD) >> ukinfo->sq_shift; iwqp->max_recv_wr = (ukinfo->rq_depth - IRDMA_RQ_RSVD) >> ukinfo->rq_shift; @@ -775,9 +804,12 @@ static void irdma_roce_fill_and_set_qpctx_info(struct irdma_qp *iwqp, roce_info = &iwqp->roce_info; ether_addr_copy(roce_info->mac_addr, iwdev->netdev->dev_addr); + if (iwqp->ibqp.qp_type == IB_QPT_GSI && iwqp->ibqp.qp_num != 1) + roce_info->is_qp1 = true; roce_info->rd_en = true; roce_info->wr_rdresp_en = true; - roce_info->bind_en = true; + if (dev->hw_attrs.uk_attrs.hw_rev >= IRDMA_GEN_3) + roce_info->bind_en = true; roce_info->dcqcn_en = false; roce_info->rtomin = 5; @@ -808,7 +840,6 @@ static void irdma_iw_fill_and_set_qpctx_info(struct irdma_qp *iwqp, ether_addr_copy(iwarp_info->mac_addr, iwdev->netdev->dev_addr); iwarp_info->rd_en = true; iwarp_info->wr_rdresp_en = true; - iwarp_info->bind_en = true; iwarp_info->ecn_en = true; iwarp_info->rtomin = 5; @@ -864,6 +895,47 @@ static void irdma_flush_worker(struct work_struct *work) irdma_generate_flush_completions(iwqp); } +static int irdma_setup_gsi_qp_rsrc(struct irdma_qp *iwqp, u32 *qp_num) +{ + struct irdma_device *iwdev = iwqp->iwdev; + struct irdma_pci_f *rf = iwdev->rf; + unsigned long flags; + int ret; + + if (rf->rdma_ver <= IRDMA_GEN_2) { + *qp_num = 1; + return 0; + } + + spin_lock_irqsave(&rf->rsrc_lock, flags); + if (!rf->hwqp1_rsvd) { + *qp_num = 1; + rf->hwqp1_rsvd = true; + spin_unlock_irqrestore(&rf->rsrc_lock, flags); + } else { + spin_unlock_irqrestore(&rf->rsrc_lock, flags); + ret = irdma_alloc_rsrc(rf, rf->allocated_qps, rf->max_qp, + qp_num, &rf->next_qp); + if (ret) + return ret; + } + + ret = irdma_vchnl_req_add_vport(&rf->sc_dev, iwdev->vport_id, *qp_num, + (&iwdev->vsi)->qos); + if (ret) { + if (*qp_num != 1) { + irdma_free_rsrc(rf, rf->allocated_qps, *qp_num); + } else { + spin_lock_irqsave(&rf->rsrc_lock, flags); + rf->hwqp1_rsvd = false; + spin_unlock_irqrestore(&rf->rsrc_lock, flags); + } + return ret; + } + + return 0; +} + /** * irdma_create_qp - create qp * @ibqp: ptr of qp @@ -889,6 +961,18 @@ static int irdma_create_qp(struct ib_qp *ibqp, struct irdma_uk_attrs *uk_attrs = &dev->hw_attrs.uk_attrs; struct irdma_qp_init_info init_info = {}; struct irdma_qp_host_ctx_info *ctx_info; + struct irdma_srq *iwsrq; + bool srq_valid = false; + u32 srq_id = 0; + + if (init_attr->srq) { + iwsrq = to_iwsrq(init_attr->srq); + srq_valid = true; + srq_id = iwsrq->srq_num; + init_attr->cap.max_recv_sge = uk_attrs->max_hw_wq_frags; + init_attr->cap.max_recv_wr = 4; + init_info.qp_uk_init_info.srq_uk = &iwsrq->sc_srq.srq_uk; + } err_code = irdma_validate_qp_attrs(init_attr, iwdev); if (err_code) @@ -925,16 +1009,20 @@ static int irdma_create_qp(struct ib_qp *ibqp, init_info.host_ctx = (__le64 *)(init_info.q2 + IRDMA_Q2_BUF_SIZE); init_info.host_ctx_pa = init_info.q2_pa + IRDMA_Q2_BUF_SIZE; - if (init_attr->qp_type == IB_QPT_GSI) - qp_num = 1; - else + if (init_attr->qp_type == 
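The init_attr->srq branch above attaches the QP's receive side to an SRQ and overrides the requested RQ caps. In libibverbs terms this is the usual create-QP-on-SRQ flow; a hedged sketch (pd, cq and srq are assumed to already exist):

#include <infiniband/verbs.h>

static struct ibv_qp *create_qp_on_srq(struct ibv_pd *pd, struct ibv_cq *cq,
				       struct ibv_srq *srq)
{
	struct ibv_qp_init_attr init_attr = {
		.send_cq = cq,
		.recv_cq = cq,
		.srq     = srq,		/* receives are taken from the SRQ */
		.qp_type = IBV_QPT_RC,
		.cap = {
			.max_send_wr  = 64,
			.max_send_sge = 1,
			/* max_recv_wr/max_recv_sge are irrelevant here;
			 * the driver overrides them, as seen above.
			 */
		},
	};

	return ibv_create_qp(pd, &init_attr);
}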
IB_QPT_GSI) { + err_code = irdma_setup_gsi_qp_rsrc(iwqp, &qp_num); + if (err_code) + goto error; + iwqp->ibqp.qp_num = 1; + } else { err_code = irdma_alloc_rsrc(rf, rf->allocated_qps, rf->max_qp, &qp_num, &rf->next_qp); - if (err_code) - goto error; + if (err_code) + goto error; + iwqp->ibqp.qp_num = qp_num; + } iwqp->iwpd = iwpd; - iwqp->ibqp.qp_num = qp_num; qp = &iwqp->sc_qp; iwqp->iwscq = to_iwcq(init_attr->send_cq); iwqp->iwrcq = to_iwcq(init_attr->recv_cq); @@ -991,13 +1079,22 @@ static int irdma_create_qp(struct ib_qp *ibqp, } ctx_info = &iwqp->ctx_info; + ctx_info->srq_valid = srq_valid; + ctx_info->srq_id = srq_id; ctx_info->send_cq_num = iwqp->iwscq->sc_cq.cq_uk.cq_id; ctx_info->rcv_cq_num = iwqp->iwrcq->sc_cq.cq_uk.cq_id; - if (rdma_protocol_roce(&iwdev->ibdev, 1)) + if (rdma_protocol_roce(&iwdev->ibdev, 1)) { + if (dev->ws_add(&iwdev->vsi, 0)) { + irdma_cqp_qp_destroy_cmd(&rf->sc_dev, &iwqp->sc_qp); + err_code = -EINVAL; + goto error; + } + irdma_qp_add_qos(&iwqp->sc_qp); irdma_roce_fill_and_set_qpctx_info(iwqp, ctx_info); - else + } else { irdma_iw_fill_and_set_qpctx_info(iwqp, ctx_info); + } err_code = irdma_cqp_create_qp_cmd(iwqp); if (err_code) @@ -1009,16 +1106,6 @@ static int irdma_create_qp(struct ib_qp *ibqp, iwqp->sig_all = init_attr->sq_sig_type == IB_SIGNAL_ALL_WR; rf->qp_table[qp_num] = iwqp; - if (rdma_protocol_roce(&iwdev->ibdev, 1)) { - if (dev->ws_add(&iwdev->vsi, 0)) { - irdma_cqp_qp_destroy_cmd(&rf->sc_dev, &iwqp->sc_qp); - err_code = -EINVAL; - goto error; - } - - irdma_qp_add_qos(&iwqp->sc_qp); - } - if (udata) { /* GEN_1 legacy support with libi40iw does not have expanded uresp struct */ if (udata->outlen < sizeof(uresp)) { @@ -1063,6 +1150,8 @@ static int irdma_get_ib_acc_flags(struct irdma_qp *iwqp) acc_flags |= IB_ACCESS_REMOTE_READ; if (iwqp->roce_info.bind_en) acc_flags |= IB_ACCESS_MW_BIND; + if (iwqp->ctx_info.remote_atomics_en) + acc_flags |= IB_ACCESS_REMOTE_ATOMIC; } else { if (iwqp->iwarp_info.wr_rdresp_en) { acc_flags |= IB_ACCESS_LOCAL_WRITE; @@ -1070,8 +1159,8 @@ static int irdma_get_ib_acc_flags(struct irdma_qp *iwqp) } if (iwqp->iwarp_info.rd_en) acc_flags |= IB_ACCESS_REMOTE_READ; - if (iwqp->iwarp_info.bind_en) - acc_flags |= IB_ACCESS_MW_BIND; + if (iwqp->ctx_info.remote_atomics_en) + acc_flags |= IB_ACCESS_REMOTE_ATOMIC; } return acc_flags; } @@ -1110,6 +1199,7 @@ static int irdma_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, attr->pkey_index = iwqp->roce_info.p_key; attr->retry_cnt = iwqp->udp_info.rexmit_thresh; attr->rnr_retry = iwqp->udp_info.rnr_nak_thresh; + attr->min_rnr_timer = iwqp->udp_info.min_rnr_timer; attr->max_rd_atomic = iwqp->roce_info.ord_size; attr->max_dest_rd_atomic = iwqp->roce_info.ird_size; } @@ -1118,6 +1208,7 @@ static int irdma_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, init_attr->qp_context = iwqp->ibqp.qp_context; init_attr->send_cq = iwqp->ibqp.send_cq; init_attr->recv_cq = iwqp->ibqp.recv_cq; + init_attr->srq = iwqp->ibqp.srq; init_attr->cap = attr->cap; return 0; @@ -1242,6 +1333,10 @@ int irdma_modify_qp_roce(struct ib_qp *ibqp, struct ib_qp_attr *attr, if (attr_mask & IB_QP_RNR_RETRY) udp_info->rnr_nak_thresh = attr->rnr_retry; + if (attr_mask & IB_QP_MIN_RNR_TIMER && + dev->hw_attrs.uk_attrs.hw_rev >= IRDMA_GEN_3) + udp_info->min_rnr_timer = attr->min_rnr_timer; + if (attr_mask & IB_QP_RETRY_CNT) udp_info->rexmit_thresh = attr->retry_cnt; @@ -1362,6 +1457,9 @@ int irdma_modify_qp_roce(struct ib_qp *ibqp, struct ib_qp_attr *attr, roce_info->wr_rdresp_en = true; if 
(attr->qp_access_flags & IB_ACCESS_REMOTE_READ) roce_info->rd_en = true; + if (dev->hw_attrs.uk_attrs.feature_flags & IRDMA_FEATURE_ATOMIC_OPS) + if (attr->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC) + ctx_info->remote_atomics_en = true; } wait_event(iwqp->mod_qp_waitq, !atomic_read(&iwqp->hw_mod_qp_pend)); @@ -1777,6 +1875,24 @@ exit: } /** + * irdma_srq_free_rsrc - free up resources for srq + * @rf: RDMA PCI function + * @iwsrq: srq ptr + */ +static void irdma_srq_free_rsrc(struct irdma_pci_f *rf, struct irdma_srq *iwsrq) +{ + struct irdma_sc_srq *srq = &iwsrq->sc_srq; + + if (!iwsrq->user_mode) { + dma_free_coherent(rf->sc_dev.hw->device, iwsrq->kmem.size, + iwsrq->kmem.va, iwsrq->kmem.pa); + iwsrq->kmem.va = NULL; + } + + irdma_free_rsrc(rf, rf->allocated_srqs, srq->srq_uk.srq_id); +} + +/** * irdma_cq_free_rsrc - free up resources for cq * @rf: RDMA PCI function * @iwcq: cq ptr @@ -1840,6 +1956,22 @@ static int irdma_process_resize_list(struct irdma_cq *iwcq, } /** + * irdma_destroy_srq - destroy srq + * @ibsrq: srq pointer + * @udata: user data + */ +static int irdma_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata) +{ + struct irdma_device *iwdev = to_iwdev(ibsrq->device); + struct irdma_srq *iwsrq = to_iwsrq(ibsrq); + struct irdma_sc_srq *srq = &iwsrq->sc_srq; + + irdma_srq_wq_destroy(iwdev->rf, srq); + irdma_srq_free_rsrc(iwdev->rf, iwsrq); + return 0; +} + +/** * irdma_destroy_cq - destroy cq * @ib_cq: cq pointer * @udata: user data @@ -1914,8 +2046,13 @@ static int irdma_resize_cq(struct ib_cq *ibcq, int entries, if (!iwcq->user_mode) { entries++; - if (rf->sc_dev.hw_attrs.uk_attrs.hw_rev >= IRDMA_GEN_2) + + if (!iwcq->sc_cq.cq_uk.avoid_mem_cflct && + dev->hw_attrs.uk_attrs.hw_rev >= IRDMA_GEN_2) entries *= 2; + + if (entries & 1) + entries += 1; /* cq size must be an even number */ } info.cq_size = max(entries, 4); @@ -2022,10 +2159,297 @@ error: return ret; } +/** + * irdma_srq_event - event notification for srq limit + * @srq: shared srq struct + */ +void irdma_srq_event(struct irdma_sc_srq *srq) +{ + struct irdma_srq *iwsrq = container_of(srq, struct irdma_srq, sc_srq); + struct ib_srq *ibsrq = &iwsrq->ibsrq; + struct ib_event event; + + srq->srq_limit = 0; + + if (!ibsrq->event_handler) + return; + + event.device = ibsrq->device; + event.element.port_num = 1; + event.element.srq = ibsrq; + event.event = IB_EVENT_SRQ_LIMIT_REACHED; + ibsrq->event_handler(&event, ibsrq->srq_context); +} + +/** + * irdma_modify_srq - modify srq request + * @ibsrq: srq's pointer for modify + * @attr: access attributes + * @attr_mask: state mask + * @udata: user data + */ +static int irdma_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask attr_mask, + struct ib_udata *udata) +{ + struct irdma_device *iwdev = to_iwdev(ibsrq->device); + struct irdma_srq *iwsrq = to_iwsrq(ibsrq); + struct irdma_cqp_request *cqp_request; + struct irdma_pci_f *rf = iwdev->rf; + struct irdma_modify_srq_info *info; + struct cqp_cmds_info *cqp_info; + int status; + + if (attr_mask & IB_SRQ_MAX_WR) + return -EINVAL; + + if (!(attr_mask & IB_SRQ_LIMIT)) + return 0; + + if (attr->srq_limit > iwsrq->sc_srq.srq_uk.srq_size) + return -EINVAL; + + /* Execute this cqp op synchronously, so we can update srq_limit + * upon successful completion. 
+ */ + cqp_request = irdma_alloc_and_get_cqp_request(&rf->cqp, true); + if (!cqp_request) + return -ENOMEM; + + cqp_info = &cqp_request->info; + info = &cqp_info->in.u.srq_modify.info; + info->srq_limit = attr->srq_limit; + if (info->srq_limit > 0xFFF) + info->srq_limit = 0xFFF; + info->arm_limit_event = 1; + + cqp_info->cqp_cmd = IRDMA_OP_SRQ_MODIFY; + cqp_info->post_sq = 1; + cqp_info->in.u.srq_modify.srq = &iwsrq->sc_srq; + cqp_info->in.u.srq_modify.scratch = (uintptr_t)cqp_request; + status = irdma_handle_cqp_op(rf, cqp_request); + irdma_put_cqp_request(&rf->cqp, cqp_request); + if (status) + return status; + + iwsrq->sc_srq.srq_limit = info->srq_limit; + + return 0; +} + +static int irdma_setup_umode_srq(struct irdma_device *iwdev, + struct irdma_srq *iwsrq, + struct irdma_srq_init_info *info, + struct ib_udata *udata) +{ +#define IRDMA_CREATE_SRQ_MIN_REQ_LEN \ + offsetofend(struct irdma_create_srq_req, user_shadow_area) + struct irdma_create_srq_req req = {}; + struct irdma_ucontext *ucontext; + struct irdma_srq_mr *srqmr; + struct irdma_pbl *iwpbl; + unsigned long flags; + + iwsrq->user_mode = true; + ucontext = rdma_udata_to_drv_context(udata, struct irdma_ucontext, + ibucontext); + + if (udata->inlen < IRDMA_CREATE_SRQ_MIN_REQ_LEN) + return -EINVAL; + + if (ib_copy_from_udata(&req, udata, + min(sizeof(req), udata->inlen))) + return -EFAULT; + + spin_lock_irqsave(&ucontext->srq_reg_mem_list_lock, flags); + iwpbl = irdma_get_pbl((unsigned long)req.user_srq_buf, + &ucontext->srq_reg_mem_list); + spin_unlock_irqrestore(&ucontext->srq_reg_mem_list_lock, flags); + if (!iwpbl) + return -EPROTO; + + iwsrq->iwpbl = iwpbl; + srqmr = &iwpbl->srq_mr; + + if (iwpbl->pbl_allocated) { + info->virtual_map = true; + info->pbl_chunk_size = 1; + info->first_pm_pbl_idx = srqmr->srq_pbl.idx; + info->leaf_pbl_size = 1; + } else { + info->srq_pa = srqmr->srq_pbl.addr; + } + info->shadow_area_pa = srqmr->shadow; + + return 0; +} + +static int irdma_setup_kmode_srq(struct irdma_device *iwdev, + struct irdma_srq *iwsrq, + struct irdma_srq_init_info *info, u32 depth, + u8 shift) +{ + struct irdma_srq_uk_init_info *ukinfo = &info->srq_uk_init_info; + struct irdma_dma_mem *mem = &iwsrq->kmem; + u32 size, ring_size; + + ring_size = depth * IRDMA_QP_WQE_MIN_SIZE; + size = ring_size + (IRDMA_SHADOW_AREA_SIZE << 3); + + mem->size = ALIGN(size, 256); + mem->va = dma_alloc_coherent(iwdev->rf->hw.device, mem->size, + &mem->pa, GFP_KERNEL); + if (!mem->va) + return -ENOMEM; + + ukinfo->srq = mem->va; + ukinfo->srq_size = depth >> shift; + ukinfo->shadow_area = mem->va + ring_size; + + info->srq_pa = mem->pa; + info->shadow_area_pa = info->srq_pa + ring_size; + + return 0; +} + +/** + * irdma_create_srq - create srq + * @ibsrq: ib's srq pointer + * @initattrs: attributes for srq + * @udata: user data for create srq + */ +static int irdma_create_srq(struct ib_srq *ibsrq, + struct ib_srq_init_attr *initattrs, + struct ib_udata *udata) +{ + struct irdma_device *iwdev = to_iwdev(ibsrq->device); + struct ib_srq_attr *attr = &initattrs->attr; + struct irdma_pd *iwpd = to_iwpd(ibsrq->pd); + struct irdma_srq *iwsrq = to_iwsrq(ibsrq); + struct irdma_srq_uk_init_info *ukinfo; + struct irdma_cqp_request *cqp_request; + struct irdma_srq_init_info info = {}; + struct irdma_pci_f *rf = iwdev->rf; + struct irdma_uk_attrs *uk_attrs; + struct cqp_cmds_info *cqp_info; + int err_code = 0; + u32 depth; + u8 shift; + + uk_attrs = &rf->sc_dev.hw_attrs.uk_attrs; + ukinfo = &info.srq_uk_init_info; + + if (initattrs->srq_type !=
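irdma_srq_event() and irdma_modify_srq() above implement the one-shot SRQ low-watermark: resizing via IB_SRQ_MAX_WR is rejected, while IB_SRQ_LIMIT arms a limit event that is delivered as IB_EVENT_SRQ_LIMIT_REACHED. The consumer side, sketched with libibverbs (illustrative only; ctx and srq assumed):

#include <infiniband/verbs.h>

static int arm_srq_limit(struct ibv_context *ctx, struct ibv_srq *srq)
{
	struct ibv_srq_attr attr = { .srq_limit = 16 };
	struct ibv_async_event ev;

	/* One-shot: fires once posted-but-unconsumed WRs drop below 16. */
	if (ibv_modify_srq(srq, &attr, IBV_SRQ_LIMIT))
		return -1;

	/* Blocks until an async event arrives. */
	if (ibv_get_async_event(ctx, &ev))
		return -1;
	if (ev.event_type == IBV_EVENT_SRQ_LIMIT_REACHED) {
		/* replenish receives here, then re-arm if desired */
	}
	ibv_ack_async_event(&ev);

	return 0;
}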
IB_SRQT_BASIC) + return -EOPNOTSUPP; + + if (!(uk_attrs->feature_flags & IRDMA_FEATURE_SRQ) || + attr->max_sge > uk_attrs->max_hw_wq_frags) + return -EINVAL; + + refcount_set(&iwsrq->refcnt, 1); + spin_lock_init(&iwsrq->lock); + err_code = irdma_alloc_rsrc(rf, rf->allocated_srqs, rf->max_srq, + &iwsrq->srq_num, &rf->next_srq); + if (err_code) + return err_code; + + ukinfo->max_srq_frag_cnt = attr->max_sge; + ukinfo->uk_attrs = uk_attrs; + ukinfo->srq_id = iwsrq->srq_num; + + irdma_get_wqe_shift(ukinfo->uk_attrs, ukinfo->max_srq_frag_cnt, 0, + &shift); + + err_code = irdma_get_srqdepth(ukinfo->uk_attrs, attr->max_wr, + shift, &depth); + if (err_code) + return err_code; + + /* Actual SRQ size in WRs for ring and HW */ + ukinfo->srq_size = depth >> shift; + + /* Max postable WRs to SRQ */ + iwsrq->max_wr = (depth - IRDMA_RQ_RSVD) >> shift; + attr->max_wr = iwsrq->max_wr; + + if (udata) + err_code = irdma_setup_umode_srq(iwdev, iwsrq, &info, udata); + else + err_code = irdma_setup_kmode_srq(iwdev, iwsrq, &info, depth, + shift); + + if (err_code) + goto free_rsrc; + + info.vsi = &iwdev->vsi; + info.pd = &iwpd->sc_pd; + + err_code = irdma_sc_srq_init(&iwsrq->sc_srq, &info); + if (err_code) + goto free_dmem; + + cqp_request = irdma_alloc_and_get_cqp_request(&rf->cqp, true); + if (!cqp_request) { + err_code = -ENOMEM; + goto free_dmem; + } + + cqp_info = &cqp_request->info; + cqp_info->cqp_cmd = IRDMA_OP_SRQ_CREATE; + cqp_info->post_sq = 1; + cqp_info->in.u.srq_create.srq = &iwsrq->sc_srq; + cqp_info->in.u.srq_create.scratch = (uintptr_t)cqp_request; + err_code = irdma_handle_cqp_op(rf, cqp_request); + irdma_put_cqp_request(&rf->cqp, cqp_request); + if (err_code) + goto free_dmem; + + if (udata) { + struct irdma_create_srq_resp resp = {}; + + resp.srq_id = iwsrq->srq_num; + resp.srq_size = ukinfo->srq_size; + if (ib_copy_to_udata(udata, &resp, + min(sizeof(resp), udata->outlen))) { + err_code = -EPROTO; + goto srq_destroy; + } + } + + return 0; + +srq_destroy: + irdma_srq_wq_destroy(rf, &iwsrq->sc_srq); + +free_dmem: + if (!iwsrq->user_mode) + dma_free_coherent(rf->hw.device, iwsrq->kmem.size, + iwsrq->kmem.va, iwsrq->kmem.pa); +free_rsrc: + irdma_free_rsrc(rf, rf->allocated_srqs, iwsrq->srq_num); + return err_code; +} + +/** + * irdma_query_srq - get SRQ attributes + * @ibsrq: the SRQ to query + * @attr: the attributes of the SRQ + */ +static int irdma_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr) +{ + struct irdma_srq *iwsrq = to_iwsrq(ibsrq); + + attr->max_wr = iwsrq->max_wr; + attr->max_sge = iwsrq->sc_srq.srq_uk.max_srq_frag_cnt; + attr->srq_limit = iwsrq->sc_srq.srq_limit; + + return 0; +} + static inline int cq_validate_flags(u32 flags, u8 hw_rev) { - /* GEN1 does not support CQ create flags */ - if (hw_rev == IRDMA_GEN_1) + /* GEN1/2 does not support CQ create flags */ + if (hw_rev <= IRDMA_GEN_2) return flags ? -EOPNOTSUPP : 0; return flags & ~IB_UVERBS_CQ_FLAGS_TIMESTAMP_COMPLETION ? -EOPNOTSUPP : 0; @@ -2058,6 +2482,7 @@ static int irdma_create_cq(struct ib_cq *ibcq, unsigned long flags; int err_code; int entries = attr->cqe; + bool cqe_64byte_ena; err_code = cq_validate_flags(attr->flags, dev->hw_attrs.uk_attrs.hw_rev); if (err_code) @@ -2081,6 +2506,9 @@ static int irdma_create_cq(struct ib_cq *ibcq, info.dev = dev; ukinfo->cq_size = max(entries, 4); ukinfo->cq_id = cq_num; + cqe_64byte_ena = dev->hw_attrs.uk_attrs.feature_flags & IRDMA_FEATURE_64_BYTE_CQE ? 
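Together with irdma_post_srq_recv() added further below, irdma_create_srq() and irdma_query_srq() back the usual SRQ lifecycle. A hedged userspace sketch (not part of the patch; pd and a registered buffer/MR are assumed):

#include <stdint.h>
#include <infiniband/verbs.h>

static struct ibv_srq *setup_srq(struct ibv_pd *pd, struct ibv_mr *mr,
				 void *buf, uint32_t len)
{
	struct ibv_srq_init_attr init = {
		.attr = { .max_wr = 256, .max_sge = 1 },
	};
	struct ibv_sge sge = {
		.addr = (uintptr_t)buf, .length = len, .lkey = mr->lkey,
	};
	struct ibv_recv_wr wr = { .wr_id = 1, .sg_list = &sge, .num_sge = 1 };
	struct ibv_recv_wr *bad;
	struct ibv_srq *srq;

	srq = ibv_create_srq(pd, &init);
	if (!srq)
		return NULL;

	/* max_wr is rounded by the driver (see attr->max_wr = iwsrq->max_wr
	 * above); query to learn the actual depth.
	 */
	ibv_query_srq(srq, &init.attr);

	if (ibv_post_srq_recv(srq, &wr, &bad)) {
		ibv_destroy_srq(srq);
		return NULL;
	}

	return srq;
}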
+ true : false; + ukinfo->avoid_mem_cflct = cqe_64byte_ena; iwcq->ibcq.cqe = info.cq_uk_init_info.cq_size; if (attr->comp_vector < rf->ceqs_count) info.ceq_id = attr->comp_vector; @@ -2116,8 +2544,6 @@ static int irdma_create_cq(struct ib_cq *ibcq, goto cq_free_rsrc; } - iwcq->iwpbl = iwpbl; - iwcq->cq_mem_size = 0; cqmr = &iwpbl->cq_mr; if (rf->sc_dev.hw_attrs.uk_attrs.feature_flags & @@ -2132,7 +2558,6 @@ static int irdma_create_cq(struct ib_cq *ibcq, err_code = -EPROTO; goto cq_free_rsrc; } - iwcq->iwpbl_shadow = iwpbl_shadow; cqmr_shadow = &iwpbl_shadow->cq_mr; info.shadow_area_pa = cqmr_shadow->cq_pbl.addr; cqmr->split = true; @@ -2156,11 +2581,18 @@ static int irdma_create_cq(struct ib_cq *ibcq, } entries++; - if (dev->hw_attrs.uk_attrs.hw_rev >= IRDMA_GEN_2) + if (!cqe_64byte_ena && dev->hw_attrs.uk_attrs.hw_rev >= IRDMA_GEN_2) entries *= 2; + + if (entries & 1) + entries += 1; /* cq size must be an even number */ + ukinfo->cq_size = entries; - rsize = info.cq_uk_init_info.cq_size * sizeof(struct irdma_cqe); + if (cqe_64byte_ena) + rsize = info.cq_uk_init_info.cq_size * sizeof(struct irdma_extended_cqe); + else + rsize = info.cq_uk_init_info.cq_size * sizeof(struct irdma_cqe); iwcq->kmem.size = ALIGN(round_up(rsize, 256), 256); iwcq->kmem.va = dma_alloc_coherent(dev->hw->device, iwcq->kmem.size, @@ -2240,8 +2672,9 @@ cq_free_rsrc: /** * irdma_get_mr_access - get hw MR access permissions from IB access flags * @access: IB access flags + * @hw_rev: Hardware version */ -static inline u16 irdma_get_mr_access(int access) +static inline u16 irdma_get_mr_access(int access, u8 hw_rev) { u16 hw_access = 0; @@ -2251,8 +2684,10 @@ static inline u16 irdma_get_mr_access(int access) IRDMA_ACCESS_FLAGS_REMOTEWRITE : 0; hw_access |= (access & IB_ACCESS_REMOTE_READ) ? IRDMA_ACCESS_FLAGS_REMOTEREAD : 0; - hw_access |= (access & IB_ACCESS_MW_BIND) ? - IRDMA_ACCESS_FLAGS_BIND_WINDOW : 0; + if (hw_rev >= IRDMA_GEN_3) { + hw_access |= (access & IB_ACCESS_MW_BIND) ? + IRDMA_ACCESS_FLAGS_BIND_WINDOW : 0; + } hw_access |= (access & IB_ZERO_BASED) ? IRDMA_ACCESS_FLAGS_ZERO_BASED : 0; hw_access |= IRDMA_ACCESS_FLAGS_LOCALREAD; @@ -2463,6 +2898,7 @@ static int irdma_handle_q_mem(struct irdma_device *iwdev, struct irdma_mr *iwmr = iwpbl->iwmr; struct irdma_qp_mr *qpmr = &iwpbl->qp_mr; struct irdma_cq_mr *cqmr = &iwpbl->cq_mr; + struct irdma_srq_mr *srqmr = &iwpbl->srq_mr; struct irdma_hmc_pble *hmc_p; u64 *arr = iwmr->pgaddrmem; u32 pg_size, total; @@ -2482,7 +2918,10 @@ static int irdma_handle_q_mem(struct irdma_device *iwdev, total = req->sq_pages + req->rq_pages; hmc_p = &qpmr->sq_pbl; qpmr->shadow = (dma_addr_t)arr[total]; - + /* Need to use physical address for RQ of QP + * in case it is associated with SRQ. 
+ */ + qpmr->rq_pa = (dma_addr_t)arr[req->sq_pages]; if (lvl) { ret = irdma_check_mem_contiguous(arr, req->sq_pages, pg_size); @@ -2502,6 +2941,18 @@ static int irdma_handle_q_mem(struct irdma_device *iwdev, hmc_p->addr = arr[req->sq_pages]; } break; + case IRDMA_MEMREG_TYPE_SRQ: + hmc_p = &srqmr->srq_pbl; + srqmr->shadow = (dma_addr_t)arr[req->rq_pages]; + if (lvl) + ret = irdma_check_mem_contiguous(arr, req->rq_pages, + pg_size); + + if (!ret) + hmc_p->idx = palloc->level1.idx; + else + hmc_p->addr = arr[0]; + break; case IRDMA_MEMREG_TYPE_CQ: hmc_p = &cqmr->cq_pbl; @@ -2806,7 +3257,10 @@ static int irdma_hwreg_mr(struct irdma_device *iwdev, struct irdma_mr *iwmr, stag_info->stag_idx = iwmr->stag >> IRDMA_CQPSQ_STAG_IDX_S; stag_info->stag_key = (u8)iwmr->stag; stag_info->total_len = iwmr->len; - stag_info->access_rights = irdma_get_mr_access(access); + stag_info->access_rights = irdma_get_mr_access(access, + iwdev->rf->sc_dev.hw_attrs.uk_attrs.hw_rev); + if (iwdev->rf->sc_dev.hw_attrs.uk_attrs.feature_flags & IRDMA_FEATURE_ATOMIC_OPS) + stag_info->remote_atomics_en = (access & IB_ACCESS_REMOTE_ATOMIC) ? 1 : 0; stag_info->pd_id = iwpd->sc_pd.pd_id; stag_info->all_memory = pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY; if (stag_info->access_rights & IRDMA_ACCESS_FLAGS_ZERO_BASED) @@ -2972,6 +3426,37 @@ static int irdma_reg_user_mr_type_qp(struct irdma_mem_reg_req req, return 0; } +static int irdma_reg_user_mr_type_srq(struct irdma_mem_reg_req req, + struct ib_udata *udata, + struct irdma_mr *iwmr) +{ + struct irdma_device *iwdev = to_iwdev(iwmr->ibmr.device); + struct irdma_pbl *iwpbl = &iwmr->iwpbl; + struct irdma_ucontext *ucontext; + unsigned long flags; + u32 total; + int err; + u8 lvl; + + total = req.rq_pages + IRDMA_SHADOW_PGCNT; + if (total > iwmr->page_cnt) + return -EINVAL; + + lvl = req.rq_pages > 1 ? 
PBLE_LEVEL_1 : PBLE_LEVEL_0; + err = irdma_handle_q_mem(iwdev, &req, iwpbl, lvl); + if (err) + return err; + + ucontext = rdma_udata_to_drv_context(udata, struct irdma_ucontext, + ibucontext); + spin_lock_irqsave(&ucontext->srq_reg_mem_list_lock, flags); + list_add_tail(&iwpbl->list, &ucontext->srq_reg_mem_list); + iwpbl->on_list = true; + spin_unlock_irqrestore(&ucontext->srq_reg_mem_list_lock, flags); + + return 0; +} + static int irdma_reg_user_mr_type_cq(struct irdma_mem_reg_req req, struct ib_udata *udata, struct irdma_mr *iwmr) @@ -3063,6 +3548,12 @@ static struct ib_mr *irdma_reg_user_mr(struct ib_pd *pd, u64 start, u64 len, goto error; break; + case IRDMA_MEMREG_TYPE_SRQ: + err = irdma_reg_user_mr_type_srq(req, udata, iwmr); + if (err) + goto error; + + break; case IRDMA_MEMREG_TYPE_CQ: err = irdma_reg_user_mr_type_cq(req, udata, iwmr); if (err) @@ -3106,9 +3597,9 @@ static struct ib_mr *irdma_reg_user_mr_dmabuf(struct ib_pd *pd, u64 start, umem_dmabuf = ib_umem_dmabuf_get_pinned(pd->device, start, len, fd, access); if (IS_ERR(umem_dmabuf)) { - err = PTR_ERR(umem_dmabuf); - ibdev_dbg(&iwdev->ibdev, "Failed to get dmabuf umem[%d]\n", err); - return ERR_PTR(err); + ibdev_dbg(&iwdev->ibdev, "Failed to get dmabuf umem[%pe]\n", + umem_dmabuf); + return ERR_CAST(umem_dmabuf); } iwmr = irdma_alloc_iwmr(&umem_dmabuf->umem, pd, virt, IRDMA_MEMREG_TYPE_MEM); @@ -3382,6 +3873,14 @@ static void irdma_del_memlist(struct irdma_mr *iwmr, } spin_unlock_irqrestore(&ucontext->qp_reg_mem_list_lock, flags); break; + case IRDMA_MEMREG_TYPE_SRQ: + spin_lock_irqsave(&ucontext->srq_reg_mem_list_lock, flags); + if (iwpbl->on_list) { + iwpbl->on_list = false; + list_del(&iwpbl->list); + } + spin_unlock_irqrestore(&ucontext->srq_reg_mem_list_lock, flags); + break; default: break; } @@ -3461,6 +3960,40 @@ static int irdma_post_send(struct ib_qp *ibqp, if (ib_wr->send_flags & IB_SEND_FENCE) info.read_fence = true; switch (ib_wr->opcode) { + case IB_WR_ATOMIC_CMP_AND_SWP: + if (unlikely(!(dev->hw_attrs.uk_attrs.feature_flags & + IRDMA_FEATURE_ATOMIC_OPS))) { + err = -EINVAL; + break; + } + info.op_type = IRDMA_OP_TYPE_ATOMIC_COMPARE_AND_SWAP; + info.op.atomic_compare_swap.tagged_offset = ib_wr->sg_list[0].addr; + info.op.atomic_compare_swap.remote_tagged_offset = + atomic_wr(ib_wr)->remote_addr; + info.op.atomic_compare_swap.swap_data_bytes = atomic_wr(ib_wr)->swap; + info.op.atomic_compare_swap.compare_data_bytes = + atomic_wr(ib_wr)->compare_add; + info.op.atomic_compare_swap.stag = ib_wr->sg_list[0].lkey; + info.op.atomic_compare_swap.remote_stag = atomic_wr(ib_wr)->rkey; + err = irdma_uk_atomic_compare_swap(ukqp, &info, false); + break; + case IB_WR_ATOMIC_FETCH_AND_ADD: + if (unlikely(!(dev->hw_attrs.uk_attrs.feature_flags & + IRDMA_FEATURE_ATOMIC_OPS))) { + err = -EINVAL; + break; + } + info.op_type = IRDMA_OP_TYPE_ATOMIC_FETCH_AND_ADD; + info.op.atomic_fetch_add.tagged_offset = ib_wr->sg_list[0].addr; + info.op.atomic_fetch_add.remote_tagged_offset = + atomic_wr(ib_wr)->remote_addr; + info.op.atomic_fetch_add.fetch_add_data_bytes = + atomic_wr(ib_wr)->compare_add; + info.op.atomic_fetch_add.stag = ib_wr->sg_list[0].lkey; + info.op.atomic_fetch_add.remote_stag = + atomic_wr(ib_wr)->rkey; + err = irdma_uk_atomic_fetch_add(ukqp, &info, false); + break; case IB_WR_SEND_WITH_IMM: if (ukqp->qp_caps & IRDMA_SEND_WITH_IMM) { info.imm_data_valid = true; @@ -3555,7 +4088,9 @@ static int irdma_post_send(struct ib_qp *ibqp, stag_info.signaled = info.signaled; stag_info.read_fence = info.read_fence; - 
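The IB_WR_ATOMIC_CMP_AND_SWP/IB_WR_ATOMIC_FETCH_AND_ADD handling added to irdma_post_send() above, plus the remote_atomics_en MR plumbing, follows the usual verbs contract: an 8-byte, naturally aligned target registered with remote-atomic access. A consumer-side sketch (illustrative only; a connected RC qp and the peer's rkey/remote_addr, obtained out of band, are assumed):

#include <stdint.h>
#include <infiniband/verbs.h>

static int post_fetch_add(struct ibv_qp *qp, struct ibv_mr *local_mr,
			  uint64_t *local_buf, uint64_t remote_addr,
			  uint32_t rkey)
{
	struct ibv_sge sge = {
		.addr   = (uintptr_t)local_buf,	/* old value lands here */
		.length = sizeof(uint64_t),
		.lkey   = local_mr->lkey,
	};
	struct ibv_send_wr wr = {
		.opcode     = IBV_WR_ATOMIC_FETCH_AND_ADD,
		.sg_list    = &sge,
		.num_sge    = 1,
		.send_flags = IBV_SEND_SIGNALED,
	};
	struct ibv_send_wr *bad;

	/* The remote buffer must have been registered with
	 * IBV_ACCESS_REMOTE_ATOMIC, and remote_addr must be 8-byte aligned.
	 */
	wr.wr.atomic.remote_addr = remote_addr;
	wr.wr.atomic.compare_add = 1;	/* value to add */
	wr.wr.atomic.rkey        = rkey;

	return ibv_post_send(qp, &wr, &bad);
}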
stag_info.access_rights = irdma_get_mr_access(reg_wr(ib_wr)->access); + stag_info.access_rights = + irdma_get_mr_access(reg_wr(ib_wr)->access, + dev->hw_attrs.uk_attrs.hw_rev); stag_info.stag_key = reg_wr(ib_wr)->key & 0xff; stag_info.stag_idx = reg_wr(ib_wr)->key >> 8; stag_info.page_size = reg_wr(ib_wr)->mr->page_size; @@ -3594,6 +4129,48 @@ static int irdma_post_send(struct ib_qp *ibqp, mod_delayed_work(iwqp->iwdev->cleanup_wq, &iwqp->dwork_flush, msecs_to_jiffies(IRDMA_FLUSH_DELAY_MS)); } + + if (err) + *bad_wr = ib_wr; + + return err; +} + +/** + * irdma_post_srq_recv - post receive wr for kernel application + * @ibsrq: ib srq pointer + * @ib_wr: work request for receive + * @bad_wr: bad wr caused an error + */ +static int irdma_post_srq_recv(struct ib_srq *ibsrq, + const struct ib_recv_wr *ib_wr, + const struct ib_recv_wr **bad_wr) +{ + struct irdma_srq *iwsrq = to_iwsrq(ibsrq); + struct irdma_srq_uk *uksrq = &iwsrq->sc_srq.srq_uk; + struct irdma_post_rq_info post_recv = {}; + unsigned long flags; + int err = 0; + + spin_lock_irqsave(&iwsrq->lock, flags); + while (ib_wr) { + if (ib_wr->num_sge > uksrq->max_srq_frag_cnt) { + err = -EINVAL; + goto out; + } + post_recv.num_sges = ib_wr->num_sge; + post_recv.wr_id = ib_wr->wr_id; + post_recv.sg_list = ib_wr->sg_list; + err = irdma_uk_srq_post_receive(uksrq, &post_recv); + if (err) + goto out; + + ib_wr = ib_wr->next; + } + +out: + spin_unlock_irqrestore(&iwsrq->lock, flags); + if (err) *bad_wr = ib_wr; @@ -3619,6 +4196,11 @@ static int irdma_post_recv(struct ib_qp *ibqp, iwqp = to_iwqp(ibqp); ukqp = &iwqp->sc_qp.qp_uk; + if (ukqp->srq_uk) { + *bad_wr = ib_wr; + return -EINVAL; + } + spin_lock_irqsave(&iwqp->lock, flags); while (ib_wr) { post_recv.num_sges = ib_wr->num_sge; @@ -3671,6 +4253,8 @@ static enum ib_wc_status irdma_flush_err_to_ib_wc_status(enum irdma_flush_opcode return IB_WC_MW_BIND_ERR; case FLUSH_REM_INV_REQ_ERR: return IB_WC_REM_INV_REQ_ERR; + case FLUSH_RNR_RETRY_EXC_ERR: + return IB_WC_RNR_RETRY_EXC_ERR; case FLUSH_FATAL_ERR: default: return IB_WC_FATAL_ERR; @@ -3727,8 +4311,12 @@ static void irdma_process_cqe(struct ib_wc *entry, if (cq_poll_info->q_type == IRDMA_CQE_QTYPE_SQ) { set_ib_wc_op_sq(cq_poll_info, entry); } else { - set_ib_wc_op_rq(cq_poll_info, entry, - qp->qp_uk.qp_caps & IRDMA_SEND_WITH_IMM); + if (qp->dev->hw_attrs.uk_attrs.hw_rev <= IRDMA_GEN_2) + set_ib_wc_op_rq(cq_poll_info, entry, + qp->qp_uk.qp_caps & IRDMA_SEND_WITH_IMM ? 
+ true : false); + else + set_ib_wc_op_rq_gen_3(cq_poll_info, entry); if (qp->qp_uk.qp_type != IRDMA_QP_TYPE_ROCE_UD && cq_poll_info->stag_invalid_set) { entry->ex.invalidate_rkey = cq_poll_info->inv_stag; @@ -3923,40 +4511,7 @@ static int irdma_req_notify_cq(struct ib_cq *ibcq, return ret; } -static int irdma_roce_port_immutable(struct ib_device *ibdev, u32 port_num, - struct ib_port_immutable *immutable) -{ - struct ib_port_attr attr; - int err; - - immutable->core_cap_flags = RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP; - err = ib_query_port(ibdev, port_num, &attr); - if (err) - return err; - - immutable->max_mad_size = IB_MGMT_MAD_SIZE; - immutable->pkey_tbl_len = attr.pkey_tbl_len; - immutable->gid_tbl_len = attr.gid_tbl_len; - - return 0; -} - -static int irdma_iw_port_immutable(struct ib_device *ibdev, u32 port_num, - struct ib_port_immutable *immutable) -{ - struct ib_port_attr attr; - int err; - - immutable->core_cap_flags = RDMA_CORE_PORT_IWARP; - err = ib_query_port(ibdev, port_num, &attr); - if (err) - return err; - immutable->gid_tbl_len = attr.gid_tbl_len; - - return 0; -} - -static const struct rdma_stat_desc irdma_hw_stat_names[] = { +static const struct rdma_stat_desc irdma_hw_stat_descs[] = { /* gen1 - 32-bit */ [IRDMA_HW_STAT_INDEX_IP4RXDISCARD].name = "ip4InDiscards", [IRDMA_HW_STAT_INDEX_IP4RXTRUNC].name = "ip4InTruncatedPkts", @@ -3964,9 +4519,6 @@ static const struct rdma_stat_desc irdma_hw_stat_names[] = { [IRDMA_HW_STAT_INDEX_IP6RXDISCARD].name = "ip6InDiscards", [IRDMA_HW_STAT_INDEX_IP6RXTRUNC].name = "ip6InTruncatedPkts", [IRDMA_HW_STAT_INDEX_IP6TXNOROUTE].name = "ip6OutNoRoutes", - [IRDMA_HW_STAT_INDEX_TCPRTXSEG].name = "tcpRetransSegs", - [IRDMA_HW_STAT_INDEX_TCPRXOPTERR].name = "tcpInOptErrors", - [IRDMA_HW_STAT_INDEX_TCPRXPROTOERR].name = "tcpInProtoErrors", [IRDMA_HW_STAT_INDEX_RXVLANERR].name = "rxVlanErrors", /* gen1 - 64-bit */ [IRDMA_HW_STAT_INDEX_IP4RXOCTS].name = "ip4InOctets", @@ -3985,16 +4537,14 @@ static const struct rdma_stat_desc irdma_hw_stat_names[] = { [IRDMA_HW_STAT_INDEX_IP6TXPKTS].name = "ip6OutPkts", [IRDMA_HW_STAT_INDEX_IP6TXFRAGS].name = "ip6OutSegRqd", [IRDMA_HW_STAT_INDEX_IP6TXMCPKTS].name = "ip6OutMcastPkts", - [IRDMA_HW_STAT_INDEX_TCPRXSEGS].name = "tcpInSegs", - [IRDMA_HW_STAT_INDEX_TCPTXSEG].name = "tcpOutSegs", - [IRDMA_HW_STAT_INDEX_RDMARXRDS].name = "iwInRdmaReads", - [IRDMA_HW_STAT_INDEX_RDMARXSNDS].name = "iwInRdmaSends", - [IRDMA_HW_STAT_INDEX_RDMARXWRS].name = "iwInRdmaWrites", - [IRDMA_HW_STAT_INDEX_RDMATXRDS].name = "iwOutRdmaReads", - [IRDMA_HW_STAT_INDEX_RDMATXSNDS].name = "iwOutRdmaSends", - [IRDMA_HW_STAT_INDEX_RDMATXWRS].name = "iwOutRdmaWrites", - [IRDMA_HW_STAT_INDEX_RDMAVBND].name = "iwRdmaBnd", - [IRDMA_HW_STAT_INDEX_RDMAVINV].name = "iwRdmaInv", + [IRDMA_HW_STAT_INDEX_RDMARXRDS].name = "InRdmaReads", + [IRDMA_HW_STAT_INDEX_RDMARXSNDS].name = "InRdmaSends", + [IRDMA_HW_STAT_INDEX_RDMARXWRS].name = "InRdmaWrites", + [IRDMA_HW_STAT_INDEX_RDMATXRDS].name = "OutRdmaReads", + [IRDMA_HW_STAT_INDEX_RDMATXSNDS].name = "OutRdmaSends", + [IRDMA_HW_STAT_INDEX_RDMATXWRS].name = "OutRdmaWrites", + [IRDMA_HW_STAT_INDEX_RDMAVBND].name = "RdmaBnd", + [IRDMA_HW_STAT_INDEX_RDMAVINV].name = "RdmaInv", /* gen2 - 32-bit */ [IRDMA_HW_STAT_INDEX_RXRPCNPHANDLED].name = "cnpHandled", @@ -4008,9 +4558,59 @@ static const struct rdma_stat_desc irdma_hw_stat_names[] = { [IRDMA_HW_STAT_INDEX_UDPRXPKTS].name = "RxUDP", [IRDMA_HW_STAT_INDEX_UDPTXPKTS].name = "TxUDP", [IRDMA_HW_STAT_INDEX_RXNPECNMARKEDPKTS].name = "RxECNMrkd", - + 
[IRDMA_HW_STAT_INDEX_TCPRTXSEG].name = "RetransSegs", + [IRDMA_HW_STAT_INDEX_TCPRXOPTERR].name = "InOptErrors", + [IRDMA_HW_STAT_INDEX_TCPRXPROTOERR].name = "InProtoErrors", + [IRDMA_HW_STAT_INDEX_TCPRXSEGS].name = "InSegs", + [IRDMA_HW_STAT_INDEX_TCPTXSEG].name = "OutSegs", + + /* gen3 */ + [IRDMA_HW_STAT_INDEX_RNR_SENT].name = "RNR sent", + [IRDMA_HW_STAT_INDEX_RNR_RCVD].name = "RNR received", + [IRDMA_HW_STAT_INDEX_RDMAORDLMTCNT].name = "ord limit count", + [IRDMA_HW_STAT_INDEX_RDMAIRDLMTCNT].name = "ird limit count", + [IRDMA_HW_STAT_INDEX_RDMARXATS].name = "Rx atomics", + [IRDMA_HW_STAT_INDEX_RDMATXATS].name = "Tx atomics", + [IRDMA_HW_STAT_INDEX_NAKSEQERR].name = "Nak Sequence Error", + [IRDMA_HW_STAT_INDEX_NAKSEQERR_IMPLIED].name = "Nak Sequence Error Implied", + [IRDMA_HW_STAT_INDEX_RTO].name = "RTO", + [IRDMA_HW_STAT_INDEX_RXOOOPKTS].name = "Rcvd Out of order packets", + [IRDMA_HW_STAT_INDEX_ICRCERR].name = "CRC errors", }; +static int irdma_roce_port_immutable(struct ib_device *ibdev, u32 port_num, + struct ib_port_immutable *immutable) +{ + struct ib_port_attr attr; + int err; + + immutable->core_cap_flags = RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP; + err = ib_query_port(ibdev, port_num, &attr); + if (err) + return err; + + immutable->max_mad_size = IB_MGMT_MAD_SIZE; + immutable->pkey_tbl_len = attr.pkey_tbl_len; + immutable->gid_tbl_len = attr.gid_tbl_len; + + return 0; +} + +static int irdma_iw_port_immutable(struct ib_device *ibdev, u32 port_num, + struct ib_port_immutable *immutable) +{ + struct ib_port_attr attr; + int err; + + immutable->core_cap_flags = RDMA_CORE_PORT_IWARP; + err = ib_query_port(ibdev, port_num, &attr); + if (err) + return err; + immutable->gid_tbl_len = attr.gid_tbl_len; + + return 0; +} + static void irdma_get_dev_fw_str(struct ib_device *dev, char *str) { struct irdma_device *iwdev = to_iwdev(dev); @@ -4034,7 +4634,7 @@ static struct rdma_hw_stats *irdma_alloc_hw_port_stats(struct ib_device *ibdev, int num_counters = dev->hw_attrs.max_stat_idx; unsigned long lifespan = RDMA_HW_STATS_DEFAULT_LIFESPAN; - return rdma_alloc_hw_stats_struct(irdma_hw_stat_names, num_counters, + return rdma_alloc_hw_stats_struct(irdma_hw_stat_descs, num_counters, lifespan); } @@ -4539,7 +5139,7 @@ static bool irdma_ah_exists(struct irdma_device *iwdev, new_ah->sc_ah.ah_info.dest_ip_addr[2] ^ new_ah->sc_ah.ah_info.dest_ip_addr[3]; - hash_for_each_possible(iwdev->ah_hash_tbl, ah, list, key) { + hash_for_each_possible(iwdev->rf->ah_hash_tbl, ah, list, key) { /* Set ah_valid and ah_id the same so memcmp can work */ new_ah->sc_ah.ah_info.ah_idx = ah->sc_ah.ah_info.ah_idx; new_ah->sc_ah.ah_info.ah_valid = ah->sc_ah.ah_info.ah_valid; @@ -4565,14 +5165,14 @@ static int irdma_destroy_ah(struct ib_ah *ibah, u32 ah_flags) struct irdma_ah *ah = to_iwah(ibah); if ((ah_flags & RDMA_DESTROY_AH_SLEEPABLE) && ah->parent_ah) { - mutex_lock(&iwdev->ah_tbl_lock); + mutex_lock(&iwdev->rf->ah_tbl_lock); if (!refcount_dec_and_test(&ah->parent_ah->refcnt)) { - mutex_unlock(&iwdev->ah_tbl_lock); + mutex_unlock(&iwdev->rf->ah_tbl_lock); return 0; } hash_del(&ah->parent_ah->list); kfree(ah->parent_ah); - mutex_unlock(&iwdev->ah_tbl_lock); + mutex_unlock(&iwdev->rf->ah_tbl_lock); } irdma_ah_cqp_op(iwdev->rf, &ah->sc_ah, IRDMA_OP_AH_DESTROY, @@ -4609,11 +5209,11 @@ static int irdma_create_user_ah(struct ib_ah *ibah, err = irdma_setup_ah(ibah, attr); if (err) return err; - mutex_lock(&iwdev->ah_tbl_lock); + mutex_lock(&iwdev->rf->ah_tbl_lock); if (!irdma_ah_exists(iwdev, ah)) { err = irdma_create_hw_ah(iwdev, 
ah, true); if (err) { - mutex_unlock(&iwdev->ah_tbl_lock); + mutex_unlock(&iwdev->rf->ah_tbl_lock); return err; } /* Add new AH to list */ @@ -4625,11 +5225,11 @@ static int irdma_create_user_ah(struct ib_ah *ibah, parent_ah->sc_ah.ah_info.dest_ip_addr[3]; ah->parent_ah = parent_ah; - hash_add(iwdev->ah_hash_tbl, &parent_ah->list, key); + hash_add(iwdev->rf->ah_hash_tbl, &parent_ah->list, key); refcount_set(&parent_ah->refcnt, 1); } } - mutex_unlock(&iwdev->ah_tbl_lock); + mutex_unlock(&iwdev->rf->ah_tbl_lock); uresp.ah_id = ah->sc_ah.ah_info.ah_idx; err = ib_copy_to_udata(udata, &uresp, min(sizeof(uresp), udata->outlen)); @@ -4691,6 +5291,20 @@ static enum rdma_link_layer irdma_get_link_layer(struct ib_device *ibdev, return IB_LINK_LAYER_ETHERNET; } +static const struct ib_device_ops irdma_gen1_dev_ops = { + .dealloc_driver = irdma_ib_dealloc_device, +}; + +static const struct ib_device_ops irdma_gen3_dev_ops = { + .alloc_mw = irdma_alloc_mw, + .create_srq = irdma_create_srq, + .dealloc_mw = irdma_dealloc_mw, + .destroy_srq = irdma_destroy_srq, + .modify_srq = irdma_modify_srq, + .post_srq_recv = irdma_post_srq_recv, + .query_srq = irdma_query_srq, +}; + static const struct ib_device_ops irdma_roce_dev_ops = { .attach_mcast = irdma_attach_mcast, .create_ah = irdma_create_ah, @@ -4725,7 +5339,6 @@ static const struct ib_device_ops irdma_dev_ops = { .alloc_hw_port_stats = irdma_alloc_hw_port_stats, .alloc_mr = irdma_alloc_mr, - .alloc_mw = irdma_alloc_mw, .alloc_pd = irdma_alloc_pd, .alloc_ucontext = irdma_alloc_ucontext, .create_cq = irdma_create_cq, @@ -4761,6 +5374,7 @@ static const struct ib_device_ops irdma_dev_ops = { INIT_RDMA_OBJ_SIZE(ib_cq, irdma_cq, ibcq), INIT_RDMA_OBJ_SIZE(ib_mw, irdma_mr, ibmw), INIT_RDMA_OBJ_SIZE(ib_qp, irdma_qp, ibqp), + INIT_RDMA_OBJ_SIZE(ib_srq, irdma_srq, ibsrq), }; /** @@ -4808,6 +5422,10 @@ static void irdma_init_rdma_device(struct irdma_device *iwdev) iwdev->ibdev.num_comp_vectors = iwdev->rf->ceqs_count; iwdev->ibdev.dev.parent = &pcidev->dev; ib_set_device_ops(&iwdev->ibdev, &irdma_dev_ops); + if (iwdev->rf->rdma_ver == IRDMA_GEN_1) + ib_set_device_ops(&iwdev->ibdev, &irdma_gen1_dev_ops); + if (iwdev->rf->rdma_ver >= IRDMA_GEN_3) + ib_set_device_ops(&iwdev->ibdev, &irdma_gen3_dev_ops); } /** @@ -4879,5 +5497,9 @@ void irdma_ib_dealloc_device(struct ib_device *ibdev) struct irdma_device *iwdev = to_iwdev(ibdev); irdma_rt_deinit_hw(iwdev); - irdma_ctrl_deinit_hw(iwdev->rf); + if (!iwdev->is_vport) { + irdma_ctrl_deinit_hw(iwdev->rf); + if (iwdev->rf->vchnl_wq) + destroy_workqueue(iwdev->rf->vchnl_wq); + } } diff --git a/drivers/infiniband/hw/irdma/verbs.h b/drivers/infiniband/hw/irdma/verbs.h index cfa140b36395..ed21c1b56e8e 100644 --- a/drivers/infiniband/hw/irdma/verbs.h +++ b/drivers/infiniband/hw/irdma/verbs.h @@ -8,6 +8,7 @@ #define IRDMA_PKEY_TBL_SZ 1 #define IRDMA_DEFAULT_PKEY 0xFFFF +#define IRDMA_SHADOW_PGCNT 1 struct irdma_ucontext { struct ib_ucontext ibucontext; @@ -17,6 +18,8 @@ struct irdma_ucontext { spinlock_t cq_reg_mem_list_lock; /* protect CQ memory list */ struct list_head qp_reg_mem_list; spinlock_t qp_reg_mem_list_lock; /* protect QP memory list */ + struct list_head srq_reg_mem_list; + spinlock_t srq_reg_mem_list_lock; /* protect SRQ memory list */ int abi_ver; u8 legacy_mode : 1; u8 use_raw_attrs : 1; @@ -65,10 +68,16 @@ struct irdma_cq_mr { bool split; }; +struct irdma_srq_mr { + struct irdma_hmc_pble srq_pbl; + dma_addr_t shadow; +}; + struct irdma_qp_mr { struct irdma_hmc_pble sq_pbl; struct irdma_hmc_pble rq_pbl; dma_addr_t 
shadow; + dma_addr_t rq_pa; struct page *sq_page; }; @@ -85,6 +94,7 @@ struct irdma_pbl { union { struct irdma_qp_mr qp_mr; struct irdma_cq_mr cq_mr; + struct irdma_srq_mr srq_mr; }; bool pbl_allocated:1; @@ -112,24 +122,33 @@ struct irdma_mr { struct irdma_pbl iwpbl; }; +struct irdma_srq { + struct ib_srq ibsrq; + struct irdma_sc_srq sc_srq __aligned(64); + struct irdma_dma_mem kmem; + u64 *srq_wrid_mem; + refcount_t refcnt; + spinlock_t lock; /* for poll srq */ + struct irdma_pbl *iwpbl; + struct irdma_sge *sg_list; + u16 srq_head; + u32 srq_num; + u32 max_wr; + bool user_mode:1; +}; + struct irdma_cq { struct ib_cq ibcq; struct irdma_sc_cq sc_cq; - u16 cq_head; - u16 cq_size; u16 cq_num; bool user_mode; atomic_t armed; enum irdma_cmpl_notify last_notify; - u32 polled_cmpls; - u32 cq_mem_size; struct irdma_dma_mem kmem; struct irdma_dma_mem kmem_shadow; struct completion free_cq; refcount_t refcnt; spinlock_t lock; /* for poll cq */ - struct irdma_pbl *iwpbl; - struct irdma_pbl *iwpbl_shadow; struct list_head resize_list; struct irdma_cq_poll_info cur_cqe; struct list_head cmpl_generated; @@ -259,6 +278,12 @@ static inline void set_ib_wc_op_sq(struct irdma_cq_poll_info *cq_poll_info, case IRDMA_OP_TYPE_FAST_REG_NSMR: entry->opcode = IB_WC_REG_MR; break; + case IRDMA_OP_TYPE_ATOMIC_COMPARE_AND_SWAP: + entry->opcode = IB_WC_COMP_SWAP; + break; + case IRDMA_OP_TYPE_ATOMIC_FETCH_AND_ADD: + entry->opcode = IB_WC_FETCH_ADD; + break; case IRDMA_OP_TYPE_INV_STAG: entry->opcode = IB_WC_LOCAL_INV; break; @@ -267,6 +292,19 @@ static inline void set_ib_wc_op_sq(struct irdma_cq_poll_info *cq_poll_info, } } +static inline void set_ib_wc_op_rq_gen_3(struct irdma_cq_poll_info *info, + struct ib_wc *entry) +{ + switch (info->op_type) { + case IRDMA_OP_TYPE_RDMA_WRITE: + case IRDMA_OP_TYPE_RDMA_WRITE_SOL: + entry->opcode = IB_WC_RECV_RDMA_WITH_IMM; + break; + default: + entry->opcode = IB_WC_RECV; + } +} + static inline void set_ib_wc_op_rq(struct irdma_cq_poll_info *cq_poll_info, struct ib_wc *entry, bool send_imm_support) { diff --git a/drivers/infiniband/hw/irdma/virtchnl.c b/drivers/infiniband/hw/irdma/virtchnl.c new file mode 100644 index 000000000000..16ad27247527 --- /dev/null +++ b/drivers/infiniband/hw/irdma/virtchnl.c @@ -0,0 +1,618 @@ +// SPDX-License-Identifier: GPL-2.0 or Linux-OpenIB +/* Copyright (c) 2015 - 2024 Intel Corporation */ + +#include "osdep.h" +#include "hmc.h" +#include "defs.h" +#include "type.h" +#include "protos.h" +#include "virtchnl.h" +#include "ws.h" +#include "i40iw_hw.h" +#include "ig3rdma_hw.h" + +struct vchnl_reg_map_elem { + u16 reg_id; + u16 reg_idx; + bool pg_rel; +}; + +struct vchnl_regfld_map_elem { + u16 regfld_id; + u16 regfld_idx; +}; + +static struct vchnl_reg_map_elem vchnl_reg_map[] = { + {IRDMA_VCHNL_REG_ID_CQPTAIL, IRDMA_CQPTAIL, false}, + {IRDMA_VCHNL_REG_ID_CQPDB, IRDMA_CQPDB, false}, + {IRDMA_VCHNL_REG_ID_CCQPSTATUS, IRDMA_CCQPSTATUS, false}, + {IRDMA_VCHNL_REG_ID_CCQPHIGH, IRDMA_CCQPHIGH, false}, + {IRDMA_VCHNL_REG_ID_CCQPLOW, IRDMA_CCQPLOW, false}, + {IRDMA_VCHNL_REG_ID_CQARM, IRDMA_CQARM, false}, + {IRDMA_VCHNL_REG_ID_CQACK, IRDMA_CQACK, false}, + {IRDMA_VCHNL_REG_ID_AEQALLOC, IRDMA_AEQALLOC, false}, + {IRDMA_VCHNL_REG_ID_CQPERRCODES, IRDMA_CQPERRCODES, false}, + {IRDMA_VCHNL_REG_ID_WQEALLOC, IRDMA_WQEALLOC, false}, + {IRDMA_VCHNL_REG_ID_DB_ADDR_OFFSET, IRDMA_DB_ADDR_OFFSET, false }, + {IRDMA_VCHNL_REG_ID_DYN_CTL, IRDMA_GLINT_DYN_CTL, false }, + {IRDMA_VCHNL_REG_INV_ID, IRDMA_VCHNL_REG_INV_ID, false } +}; + +static struct vchnl_regfld_map_elem 
vchnl_regfld_map[] = { + {IRDMA_VCHNL_REGFLD_ID_CCQPSTATUS_CQP_OP_ERR, IRDMA_CCQPSTATUS_CCQP_ERR_M}, + {IRDMA_VCHNL_REGFLD_ID_CCQPSTATUS_CCQP_DONE, IRDMA_CCQPSTATUS_CCQP_DONE_M}, + {IRDMA_VCHNL_REGFLD_ID_CQPSQ_STAG_PDID, IRDMA_CQPSQ_STAG_PDID_M}, + {IRDMA_VCHNL_REGFLD_ID_CQPSQ_CQ_CEQID, IRDMA_CQPSQ_CQ_CEQID_M}, + {IRDMA_VCHNL_REGFLD_ID_CQPSQ_CQ_CQID, IRDMA_CQPSQ_CQ_CQID_M}, + {IRDMA_VCHNL_REGFLD_ID_COMMIT_FPM_CQCNT, IRDMA_COMMIT_FPM_CQCNT_M}, + {IRDMA_VCHNL_REGFLD_ID_UPESD_HMCN_ID, IRDMA_CQPSQ_UPESD_HMCFNID_M}, + {IRDMA_VCHNL_REGFLD_INV_ID, IRDMA_VCHNL_REGFLD_INV_ID} +}; + +#define IRDMA_VCHNL_REG_COUNT ARRAY_SIZE(vchnl_reg_map) +#define IRDMA_VCHNL_REGFLD_COUNT ARRAY_SIZE(vchnl_regfld_map) +#define IRDMA_VCHNL_REGFLD_BUF_SIZE \ + (IRDMA_VCHNL_REG_COUNT * sizeof(struct irdma_vchnl_reg_info) + \ + IRDMA_VCHNL_REGFLD_COUNT * sizeof(struct irdma_vchnl_reg_field_info)) +#define IRDMA_REGMAP_RESP_BUF_SIZE (IRDMA_VCHNL_RESP_MIN_SIZE + IRDMA_VCHNL_REGFLD_BUF_SIZE) + +/** + * irdma_sc_vchnl_init - Initialize dev virtchannel and get hw_rev + * @dev: dev structure to update + * @info: virtchannel info parameters to fill into the dev structure + */ +int irdma_sc_vchnl_init(struct irdma_sc_dev *dev, + struct irdma_vchnl_init_info *info) +{ + dev->vchnl_up = true; + dev->privileged = info->privileged; + dev->is_pf = info->is_pf; + dev->hw_attrs.uk_attrs.hw_rev = info->hw_rev; + + if (!dev->privileged) { + int ret = irdma_vchnl_req_get_ver(dev, IRDMA_VCHNL_CHNL_VER_MAX, + &dev->vchnl_ver); + + ibdev_dbg(to_ibdev(dev), + "DEV: Get Channel version ret = %d, version is %u\n", + ret, dev->vchnl_ver); + + if (ret) + return ret; + + ret = irdma_vchnl_req_get_caps(dev); + if (ret) + return ret; + + dev->hw_attrs.uk_attrs.hw_rev = dev->vc_caps.hw_rev; + } + + return 0; +} + +/** + * irdma_vchnl_req_verify_resp - Verify requested response size + * @vchnl_req: vchnl message requested + * @resp_len: response length sent from vchnl peer + */ +static int irdma_vchnl_req_verify_resp(struct irdma_vchnl_req *vchnl_req, + u16 resp_len) +{ + switch (vchnl_req->vchnl_msg->op_code) { + case IRDMA_VCHNL_OP_GET_VER: + case IRDMA_VCHNL_OP_GET_HMC_FCN: + case IRDMA_VCHNL_OP_PUT_HMC_FCN: + if (resp_len != vchnl_req->parm_len) + return -EBADMSG; + break; + case IRDMA_VCHNL_OP_GET_RDMA_CAPS: + if (resp_len < IRDMA_VCHNL_OP_GET_RDMA_CAPS_MIN_SIZE) + return -EBADMSG; + break; + case IRDMA_VCHNL_OP_GET_REG_LAYOUT: + case IRDMA_VCHNL_OP_QUEUE_VECTOR_MAP: + case IRDMA_VCHNL_OP_QUEUE_VECTOR_UNMAP: + case IRDMA_VCHNL_OP_ADD_VPORT: + case IRDMA_VCHNL_OP_DEL_VPORT: + break; + default: + return -EOPNOTSUPP; + } + + return 0; +} + +static void irdma_free_vchnl_req_msg(struct irdma_vchnl_req *vchnl_req) +{ + kfree(vchnl_req->vchnl_msg); +} + +static int irdma_alloc_vchnl_req_msg(struct irdma_vchnl_req *vchnl_req, + struct irdma_vchnl_req_init_info *info) +{ + struct irdma_vchnl_op_buf *vchnl_msg; + + vchnl_msg = kzalloc(IRDMA_VCHNL_MAX_MSG_SIZE, GFP_KERNEL); + + if (!vchnl_msg) + return -ENOMEM; + + vchnl_msg->op_ctx = (uintptr_t)vchnl_req; + vchnl_msg->buf_len = sizeof(*vchnl_msg) + info->req_parm_len; + if (info->req_parm_len) + memcpy(vchnl_msg->buf, info->req_parm, info->req_parm_len); + vchnl_msg->op_code = info->op_code; + vchnl_msg->op_ver = info->op_ver; + + vchnl_req->vchnl_msg = vchnl_msg; + vchnl_req->parm = info->resp_parm; + vchnl_req->parm_len = info->resp_parm_len; + + return 0; +} + +static int irdma_vchnl_req_send_sync(struct irdma_sc_dev *dev, + struct irdma_vchnl_req_init_info *info) +{ + u16 resp_len = 
sizeof(dev->vc_recv_buf); + struct irdma_vchnl_req vchnl_req = {}; + u16 msg_len; + u8 *msg; + int ret; + + ret = irdma_alloc_vchnl_req_msg(&vchnl_req, info); + if (ret) + return ret; + + msg_len = vchnl_req.vchnl_msg->buf_len; + msg = (u8 *)vchnl_req.vchnl_msg; + + mutex_lock(&dev->vchnl_mutex); + ret = ig3rdma_vchnl_send_sync(dev, msg, msg_len, dev->vc_recv_buf, + &resp_len); + dev->vc_recv_len = resp_len; + if (ret) + goto exit; + + ret = irdma_vchnl_req_get_resp(dev, &vchnl_req); +exit: + mutex_unlock(&dev->vchnl_mutex); + ibdev_dbg(to_ibdev(dev), + "VIRT: virtual channel send %s caller: %pS ret=%d op=%u op_ver=%u req_len=%u parm_len=%u resp_len=%u\n", + !ret ? "SUCCEEDS" : "FAILS", __builtin_return_address(0), + ret, vchnl_req.vchnl_msg->op_code, + vchnl_req.vchnl_msg->op_ver, vchnl_req.vchnl_msg->buf_len, + vchnl_req.parm_len, vchnl_req.resp_len); + irdma_free_vchnl_req_msg(&vchnl_req); + + return ret; +} + +/** + * irdma_vchnl_req_get_reg_layout - Get Register Layout + * @dev: RDMA device pointer + */ +int irdma_vchnl_req_get_reg_layout(struct irdma_sc_dev *dev) +{ + u16 reg_idx, reg_id, tmp_reg_id, regfld_idx, regfld_id, tmp_regfld_id; + struct irdma_vchnl_reg_field_info *regfld_array = NULL; + u8 resp_buffer[IRDMA_REGMAP_RESP_BUF_SIZE] = {}; + struct vchnl_regfld_map_elem *regfld_map_array; + struct irdma_vchnl_req_init_info info = {}; + struct vchnl_reg_map_elem *reg_map_array; + struct irdma_vchnl_reg_info *reg_array; + u8 num_bits, shift_cnt; + u16 buf_len = 0; + u64 bitmask; + u32 rindex; + int ret; + + if (!dev->vchnl_up) + return -EBUSY; + + info.op_code = IRDMA_VCHNL_OP_GET_REG_LAYOUT; + info.op_ver = IRDMA_VCHNL_OP_GET_REG_LAYOUT_V0; + info.resp_parm = resp_buffer; + info.resp_parm_len = sizeof(resp_buffer); + + ret = irdma_vchnl_req_send_sync(dev, &info); + + if (ret) + return ret; + + /* parse the response buffer and update reg info*/ + /* Parse registers till invalid */ + /* Parse register fields till invalid */ + reg_array = (struct irdma_vchnl_reg_info *)resp_buffer; + for (rindex = 0; rindex < IRDMA_VCHNL_REG_COUNT; rindex++) { + buf_len += sizeof(struct irdma_vchnl_reg_info); + if (buf_len >= sizeof(resp_buffer)) + return -ENOMEM; + + regfld_array = + (struct irdma_vchnl_reg_field_info *)®_array[rindex + 1]; + reg_id = reg_array[rindex].reg_id; + if (reg_id == IRDMA_VCHNL_REG_INV_ID) + break; + + reg_id &= ~IRDMA_VCHNL_REG_PAGE_REL; + if (reg_id >= IRDMA_VCHNL_REG_COUNT) + return -EINVAL; + + /* search regmap for register index in hw_regs.*/ + reg_map_array = vchnl_reg_map; + do { + tmp_reg_id = reg_map_array->reg_id; + if (tmp_reg_id == reg_id) + break; + + reg_map_array++; + } while (tmp_reg_id != IRDMA_VCHNL_REG_INV_ID); + if (tmp_reg_id != reg_id) + continue; + + reg_idx = reg_map_array->reg_idx; + + /* Page relative, DB Offset do not need bar offset */ + if (reg_idx == IRDMA_DB_ADDR_OFFSET || + (reg_array[rindex].reg_id & IRDMA_VCHNL_REG_PAGE_REL)) { + dev->hw_regs[reg_idx] = + (u32 __iomem *)(uintptr_t)reg_array[rindex].reg_offset; + continue; + } + + /* Update the local HW struct */ + dev->hw_regs[reg_idx] = ig3rdma_get_reg_addr(dev->hw, + reg_array[rindex].reg_offset); + if (!dev->hw_regs[reg_idx]) + return -EINVAL; + } + + if (!regfld_array) + return -ENOMEM; + + /* set up doorbell variables using mapped DB page */ + dev->wqe_alloc_db = dev->hw_regs[IRDMA_WQEALLOC]; + dev->cq_arm_db = dev->hw_regs[IRDMA_CQARM]; + dev->aeq_alloc_db = dev->hw_regs[IRDMA_AEQALLOC]; + dev->cqp_db = dev->hw_regs[IRDMA_CQPDB]; + dev->cq_ack_db = dev->hw_regs[IRDMA_CQACK]; + + for 
(rindex = 0; rindex < IRDMA_VCHNL_REGFLD_COUNT; rindex++) { + buf_len += sizeof(struct irdma_vchnl_reg_field_info); + if ((buf_len - 1) > sizeof(resp_buffer)) + break; + + if (regfld_array[rindex].fld_id == IRDMA_VCHNL_REGFLD_INV_ID) + break; + + regfld_id = regfld_array[rindex].fld_id; + regfld_map_array = vchnl_regfld_map; + do { + tmp_regfld_id = regfld_map_array->regfld_id; + if (tmp_regfld_id == regfld_id) + break; + + regfld_map_array++; + } while (tmp_regfld_id != IRDMA_VCHNL_REGFLD_INV_ID); + + if (tmp_regfld_id != regfld_id) + continue; + + regfld_idx = regfld_map_array->regfld_idx; + + num_bits = regfld_array[rindex].fld_bits; + shift_cnt = regfld_array[rindex].fld_shift; + if ((num_bits + shift_cnt > 64) || !num_bits) { + ibdev_dbg(to_ibdev(dev), + "ERR: Invalid field mask id %d bits %d shift %d", + regfld_id, num_bits, shift_cnt); + + continue; + } + + bitmask = (1ULL << num_bits) - 1; + dev->hw_masks[regfld_idx] = bitmask << shift_cnt; + dev->hw_shifts[regfld_idx] = shift_cnt; + } + + return 0; +} + +int irdma_vchnl_req_add_vport(struct irdma_sc_dev *dev, u16 vport_id, + u32 qp1_id, struct irdma_qos *qos) +{ + struct irdma_vchnl_resp_vport_info resp_vport = { 0 }; + struct irdma_vchnl_req_vport_info req_vport = { 0 }; + struct irdma_vchnl_req_init_info info = { 0 }; + int ret, i; + + if (!dev->vchnl_up) + return -EBUSY; + + info.op_code = IRDMA_VCHNL_OP_ADD_VPORT; + info.op_ver = IRDMA_VCHNL_OP_ADD_VPORT_V0; + req_vport.vport_id = vport_id; + req_vport.qp1_id = qp1_id; + info.req_parm_len = sizeof(req_vport); + info.req_parm = &req_vport; + info.resp_parm = &resp_vport; + info.resp_parm_len = sizeof(resp_vport); + + ret = irdma_vchnl_req_send_sync(dev, &info); + if (ret) + return ret; + + for (i = 0; i < IRDMA_MAX_USER_PRIORITY; i++) { + qos[i].qs_handle = resp_vport.qs_handle[i]; + qos[i].valid = true; + } + + return 0; +} + +int irdma_vchnl_req_del_vport(struct irdma_sc_dev *dev, u16 vport_id, u32 qp1_id) +{ + struct irdma_vchnl_req_init_info info = { 0 }; + struct irdma_vchnl_req_vport_info req_vport = { 0 }; + + if (!dev->vchnl_up) + return -EBUSY; + + info.op_code = IRDMA_VCHNL_OP_DEL_VPORT; + info.op_ver = IRDMA_VCHNL_OP_DEL_VPORT_V0; + req_vport.vport_id = vport_id; + req_vport.qp1_id = qp1_id; + info.req_parm_len = sizeof(req_vport); + info.req_parm = &req_vport; + + return irdma_vchnl_req_send_sync(dev, &info); +} + +/** + * irdma_vchnl_req_aeq_vec_map - Map AEQ to vector on this function + * @dev: RDMA device pointer + * @v_idx: vector index + */ +int irdma_vchnl_req_aeq_vec_map(struct irdma_sc_dev *dev, u32 v_idx) +{ + struct irdma_vchnl_req_init_info info = {}; + struct irdma_vchnl_qvlist_info *qvl; + struct irdma_vchnl_qv_info *qv; + u16 qvl_size, num_vectors = 1; + int ret; + + if (!dev->vchnl_up) + return -EBUSY; + + qvl_size = struct_size(qvl, qv_info, num_vectors); + + qvl = kzalloc(qvl_size, GFP_KERNEL); + if (!qvl) + return -ENOMEM; + + qvl->num_vectors = 1; + qv = qvl->qv_info; + + qv->ceq_idx = IRDMA_Q_INVALID_IDX; + qv->v_idx = v_idx; + qv->itr_idx = IRDMA_IDX_ITR0; + + info.op_code = IRDMA_VCHNL_OP_QUEUE_VECTOR_MAP; + info.op_ver = IRDMA_VCHNL_OP_QUEUE_VECTOR_MAP_V0; + info.req_parm = qvl; + info.req_parm_len = qvl_size; + + ret = irdma_vchnl_req_send_sync(dev, &info); + kfree(qvl); + + return ret; +} + +/** + * irdma_vchnl_req_ceq_vec_map - Map CEQ to vector on this function + * @dev: RDMA device pointer + * @ceq_id: CEQ index + * @v_idx: vector index + */ +int irdma_vchnl_req_ceq_vec_map(struct irdma_sc_dev *dev, u16 ceq_id, u32 v_idx) +{ + struct 
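Both queue-vector map helpers above size their flexible-array qvlist with struct_size() before kzalloc(). The same pattern in isolation (a sketch using a hypothetical struct, not driver code):

#include <linux/overflow.h>
#include <linux/slab.h>
#include <linux/types.h>

struct qv_list {
	u32 num;
	struct { u32 v_idx; } entries[];	/* flexible array member */
};

static struct qv_list *alloc_qv_list(u32 n)
{
	/* struct_size() == sizeof(*l) + n * sizeof(l->entries[0]),
	 * saturating on overflow so kzalloc() fails cleanly.
	 */
	struct qv_list *l = kzalloc(struct_size(l, entries, n), GFP_KERNEL);

	if (l)
		l->num = n;

	return l;
}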
irdma_vchnl_req_init_info info = {}; + struct irdma_vchnl_qvlist_info *qvl; + struct irdma_vchnl_qv_info *qv; + u16 qvl_size, num_vectors = 1; + int ret; + + if (!dev->vchnl_up) + return -EBUSY; + + qvl_size = struct_size(qvl, qv_info, num_vectors); + + qvl = kzalloc(qvl_size, GFP_KERNEL); + if (!qvl) + return -ENOMEM; + + qvl->num_vectors = num_vectors; + qv = qvl->qv_info; + + qv->aeq_idx = IRDMA_Q_INVALID_IDX; + qv->ceq_idx = ceq_id; + qv->v_idx = v_idx; + qv->itr_idx = IRDMA_IDX_ITR0; + + info.op_code = IRDMA_VCHNL_OP_QUEUE_VECTOR_MAP; + info.op_ver = IRDMA_VCHNL_OP_QUEUE_VECTOR_MAP_V0; + info.req_parm = qvl; + info.req_parm_len = qvl_size; + + ret = irdma_vchnl_req_send_sync(dev, &info); + kfree(qvl); + + return ret; +} + +/** + * irdma_vchnl_req_get_ver - Request Channel version + * @dev: RDMA device pointer + * @ver_req: Virtual channel version requested + * @ver_res: Virtual channel version response + */ +int irdma_vchnl_req_get_ver(struct irdma_sc_dev *dev, u16 ver_req, u32 *ver_res) +{ + struct irdma_vchnl_req_init_info info = {}; + int ret; + + if (!dev->vchnl_up) + return -EBUSY; + + info.op_code = IRDMA_VCHNL_OP_GET_VER; + info.op_ver = ver_req; + info.resp_parm = ver_res; + info.resp_parm_len = sizeof(*ver_res); + + ret = irdma_vchnl_req_send_sync(dev, &info); + if (ret) + return ret; + + if (*ver_res < IRDMA_VCHNL_CHNL_VER_MIN) { + ibdev_dbg(to_ibdev(dev), + "VIRT: %s unsupported vchnl version 0x%0x\n", + __func__, *ver_res); + return -EOPNOTSUPP; + } + + return 0; +} + +/** + * irdma_vchnl_req_get_hmc_fcn - Request VF HMC Function + * @dev: RDMA device pointer + */ +int irdma_vchnl_req_get_hmc_fcn(struct irdma_sc_dev *dev) +{ + struct irdma_vchnl_req_hmc_info req_hmc = {}; + struct irdma_vchnl_resp_hmc_info resp_hmc = {}; + struct irdma_vchnl_req_init_info info = {}; + int ret; + + if (!dev->vchnl_up) + return -EBUSY; + + info.op_code = IRDMA_VCHNL_OP_GET_HMC_FCN; + if (dev->hw_attrs.uk_attrs.hw_rev >= IRDMA_GEN_3) { + info.op_ver = IRDMA_VCHNL_OP_GET_HMC_FCN_V2; + req_hmc.protocol_used = dev->protocol_used; + info.req_parm_len = sizeof(req_hmc); + info.req_parm = &req_hmc; + info.resp_parm = &resp_hmc; + info.resp_parm_len = sizeof(resp_hmc); + } + + ret = irdma_vchnl_req_send_sync(dev, &info); + + if (ret) + return ret; + + if (dev->hw_attrs.uk_attrs.hw_rev >= IRDMA_GEN_3) { + int i; + + dev->hmc_fn_id = resp_hmc.hmc_func; + + for (i = 0; i < IRDMA_MAX_USER_PRIORITY; i++) { + dev->qos[i].qs_handle = resp_hmc.qs_handle[i]; + dev->qos[i].valid = true; + } + } + return 0; +} + +/** + * irdma_vchnl_req_put_hmc_fcn - Free VF HMC Function + * @dev: RDMA device pointer + */ +int irdma_vchnl_req_put_hmc_fcn(struct irdma_sc_dev *dev) +{ + struct irdma_vchnl_req_init_info info = {}; + + if (!dev->vchnl_up) + return -EBUSY; + + info.op_code = IRDMA_VCHNL_OP_PUT_HMC_FCN; + info.op_ver = IRDMA_VCHNL_OP_PUT_HMC_FCN_V0; + + return irdma_vchnl_req_send_sync(dev, &info); +} + +/** + * irdma_vchnl_req_get_caps - Request RDMA capabilities + * @dev: RDMA device pointer + */ +int irdma_vchnl_req_get_caps(struct irdma_sc_dev *dev) +{ + struct irdma_vchnl_req_init_info info = {}; + int ret; + + if (!dev->vchnl_up) + return -EBUSY; + + info.op_code = IRDMA_VCHNL_OP_GET_RDMA_CAPS; + info.op_ver = IRDMA_VCHNL_OP_GET_RDMA_CAPS_V0; + info.resp_parm = &dev->vc_caps; + info.resp_parm_len = sizeof(dev->vc_caps); + + ret = irdma_vchnl_req_send_sync(dev, &info); + + if (ret) + return ret; + + if (dev->vc_caps.hw_rev > IRDMA_GEN_MAX || + dev->vc_caps.hw_rev < IRDMA_GEN_2) { + ibdev_dbg(to_ibdev(dev), + 
"ERR: %s unsupported hw_rev version 0x%0x\n", + __func__, dev->vc_caps.hw_rev); + return -EOPNOTSUPP; + } + + return 0; +} + +/** + * irdma_vchnl_req_get_resp - Receive the inbound vchnl response. + * @dev: Dev pointer + * @vchnl_req: Vchannel request + */ +int irdma_vchnl_req_get_resp(struct irdma_sc_dev *dev, + struct irdma_vchnl_req *vchnl_req) +{ + struct irdma_vchnl_resp_buf *vchnl_msg_resp = + (struct irdma_vchnl_resp_buf *)dev->vc_recv_buf; + u16 resp_len; + int ret; + + if ((uintptr_t)vchnl_req != (uintptr_t)vchnl_msg_resp->op_ctx) { + ibdev_dbg(to_ibdev(dev), + "VIRT: error vchnl context value does not match\n"); + return -EBADMSG; + } + + resp_len = dev->vc_recv_len - sizeof(*vchnl_msg_resp); + resp_len = min(resp_len, vchnl_req->parm_len); + + ret = irdma_vchnl_req_verify_resp(vchnl_req, resp_len); + if (ret) + return ret; + + ret = (int)vchnl_msg_resp->op_ret; + if (ret) + return ret; + + vchnl_req->resp_len = 0; + if (vchnl_req->parm_len && vchnl_req->parm && resp_len) { + memcpy(vchnl_req->parm, vchnl_msg_resp->buf, resp_len); + vchnl_req->resp_len = resp_len; + ibdev_dbg(to_ibdev(dev), "VIRT: Got response, data size %u\n", + resp_len); + } + + return 0; +} diff --git a/drivers/infiniband/hw/irdma/virtchnl.h b/drivers/infiniband/hw/irdma/virtchnl.h new file mode 100644 index 000000000000..aa955a9125bd --- /dev/null +++ b/drivers/infiniband/hw/irdma/virtchnl.h @@ -0,0 +1,176 @@ +/* SPDX-License-Identifier: GPL-2.0 or Linux-OpenIB */ +/* Copyright (c) 2015 - 2024 Intel Corporation */ +#ifndef IRDMA_VIRTCHNL_H +#define IRDMA_VIRTCHNL_H + +#include "hmc.h" +#include "irdma.h" + +/* IRDMA_VCHNL_CHNL_VER_V0 is for legacy hw, no longer supported. */ +#define IRDMA_VCHNL_CHNL_VER_V2 2 +#define IRDMA_VCHNL_CHNL_VER_MIN IRDMA_VCHNL_CHNL_VER_V2 +#define IRDMA_VCHNL_CHNL_VER_MAX IRDMA_VCHNL_CHNL_VER_V2 +#define IRDMA_VCHNL_OP_GET_HMC_FCN_V0 0 +#define IRDMA_VCHNL_OP_GET_HMC_FCN_V1 1 +#define IRDMA_VCHNL_OP_GET_HMC_FCN_V2 2 +#define IRDMA_VCHNL_OP_PUT_HMC_FCN_V0 0 +#define IRDMA_VCHNL_OP_GET_REG_LAYOUT_V0 0 +#define IRDMA_VCHNL_OP_QUEUE_VECTOR_MAP_V0 0 +#define IRDMA_VCHNL_OP_QUEUE_VECTOR_UNMAP_V0 0 +#define IRDMA_VCHNL_OP_ADD_VPORT_V0 0 +#define IRDMA_VCHNL_OP_DEL_VPORT_V0 0 +#define IRDMA_VCHNL_OP_GET_RDMA_CAPS_V0 0 +#define IRDMA_VCHNL_OP_GET_RDMA_CAPS_MIN_SIZE 1 + +#define IRDMA_VCHNL_REG_ID_CQPTAIL 0 +#define IRDMA_VCHNL_REG_ID_CQPDB 1 +#define IRDMA_VCHNL_REG_ID_CCQPSTATUS 2 +#define IRDMA_VCHNL_REG_ID_CCQPHIGH 3 +#define IRDMA_VCHNL_REG_ID_CCQPLOW 4 +#define IRDMA_VCHNL_REG_ID_CQARM 5 +#define IRDMA_VCHNL_REG_ID_CQACK 6 +#define IRDMA_VCHNL_REG_ID_AEQALLOC 7 +#define IRDMA_VCHNL_REG_ID_CQPERRCODES 8 +#define IRDMA_VCHNL_REG_ID_WQEALLOC 9 +#define IRDMA_VCHNL_REG_ID_IPCONFIG0 10 +#define IRDMA_VCHNL_REG_ID_DB_ADDR_OFFSET 11 +#define IRDMA_VCHNL_REG_ID_DYN_CTL 12 +#define IRDMA_VCHNL_REG_ID_AEQITRMASK 13 +#define IRDMA_VCHNL_REG_ID_CEQITRMASK 14 +#define IRDMA_VCHNL_REG_INV_ID 0xFFFF +#define IRDMA_VCHNL_REG_PAGE_REL 0x8000 + +#define IRDMA_VCHNL_REGFLD_ID_CCQPSTATUS_CQP_OP_ERR 2 +#define IRDMA_VCHNL_REGFLD_ID_CCQPSTATUS_CCQP_DONE 5 +#define IRDMA_VCHNL_REGFLD_ID_CQPSQ_STAG_PDID 6 +#define IRDMA_VCHNL_REGFLD_ID_CQPSQ_CQ_CEQID 7 +#define IRDMA_VCHNL_REGFLD_ID_CQPSQ_CQ_CQID 8 +#define IRDMA_VCHNL_REGFLD_ID_COMMIT_FPM_CQCNT 9 +#define IRDMA_VCHNL_REGFLD_ID_UPESD_HMCN_ID 10 +#define IRDMA_VCHNL_REGFLD_INV_ID 0xFFFF + +#define IRDMA_VCHNL_RESP_MIN_SIZE (sizeof(struct irdma_vchnl_resp_buf)) + +enum irdma_vchnl_ops { + IRDMA_VCHNL_OP_GET_VER = 0, + IRDMA_VCHNL_OP_GET_HMC_FCN = 1, + 
IRDMA_VCHNL_OP_PUT_HMC_FCN = 2, + IRDMA_VCHNL_OP_GET_REG_LAYOUT = 11, + IRDMA_VCHNL_OP_GET_RDMA_CAPS = 13, + IRDMA_VCHNL_OP_QUEUE_VECTOR_MAP = 14, + IRDMA_VCHNL_OP_QUEUE_VECTOR_UNMAP = 15, + IRDMA_VCHNL_OP_ADD_VPORT = 16, + IRDMA_VCHNL_OP_DEL_VPORT = 17, +}; + +struct irdma_vchnl_req_hmc_info { + u8 protocol_used; + u8 disable_qos; +} __packed; + +struct irdma_vchnl_resp_hmc_info { + u16 hmc_func; + u16 qs_handle[IRDMA_MAX_USER_PRIORITY]; +} __packed; + +struct irdma_vchnl_qv_info { + u32 v_idx; + u16 ceq_idx; + u16 aeq_idx; + u8 itr_idx; +}; + +struct irdma_vchnl_qvlist_info { + u32 num_vectors; + struct irdma_vchnl_qv_info qv_info[]; +}; + +struct irdma_vchnl_req_vport_info { + u16 vport_id; + u32 qp1_id; +}; + +struct irdma_vchnl_resp_vport_info { + u16 qs_handle[IRDMA_MAX_USER_PRIORITY]; +}; + +struct irdma_vchnl_op_buf { + u16 op_code; + u16 op_ver; + u16 buf_len; + u16 rsvd; + u64 op_ctx; + u8 buf[]; +} __packed; + +struct irdma_vchnl_resp_buf { + u64 op_ctx; + u16 buf_len; + s16 op_ret; + u16 rsvd[2]; + u8 buf[]; +} __packed; + +struct irdma_vchnl_rdma_caps { + u8 hw_rev; + u16 cqp_timeout_s; + u16 cqp_def_timeout_s; + u16 max_hw_push_len; +} __packed; + +struct irdma_vchnl_init_info { + struct workqueue_struct *vchnl_wq; + enum irdma_vers hw_rev; + bool privileged; + bool is_pf; +}; + +struct irdma_vchnl_reg_info { + u32 reg_offset; + u16 field_cnt; + u16 reg_id; /* High bit of reg_id: bar or page relative */ +}; + +struct irdma_vchnl_reg_field_info { + u8 fld_shift; + u8 fld_bits; + u16 fld_id; +}; + +struct irdma_vchnl_req { + struct irdma_vchnl_op_buf *vchnl_msg; + void *parm; + u32 vf_id; + u16 parm_len; + u16 resp_len; +}; + +struct irdma_vchnl_req_init_info { + void *req_parm; + void *resp_parm; + u16 req_parm_len; + u16 resp_parm_len; + u16 op_code; + u16 op_ver; +} __packed; + +struct irdma_qos; + +int irdma_sc_vchnl_init(struct irdma_sc_dev *dev, + struct irdma_vchnl_init_info *info); +int irdma_vchnl_req_get_ver(struct irdma_sc_dev *dev, u16 ver_req, + u32 *ver_res); +int irdma_vchnl_req_get_hmc_fcn(struct irdma_sc_dev *dev); +int irdma_vchnl_req_put_hmc_fcn(struct irdma_sc_dev *dev); +int irdma_vchnl_req_get_caps(struct irdma_sc_dev *dev); +int irdma_vchnl_req_get_resp(struct irdma_sc_dev *dev, + struct irdma_vchnl_req *vc_req); +int irdma_vchnl_req_get_reg_layout(struct irdma_sc_dev *dev); +int irdma_vchnl_req_aeq_vec_map(struct irdma_sc_dev *dev, u32 v_idx); +int irdma_vchnl_req_ceq_vec_map(struct irdma_sc_dev *dev, u16 ceq_id, + u32 v_idx); +int irdma_vchnl_req_add_vport(struct irdma_sc_dev *dev, u16 vport_id, + u32 qp1_id, struct irdma_qos *qos); +int irdma_vchnl_req_del_vport(struct irdma_sc_dev *dev, u16 vport_id, + u32 qp1_id); +#endif /* IRDMA_VIRTCHNL_H */ diff --git a/drivers/infiniband/hw/mana/cq.c b/drivers/infiniband/hw/mana/cq.c index 28e154bbb50f..1becc8779123 100644 --- a/drivers/infiniband/hw/mana/cq.c +++ b/drivers/infiniband/hw/mana/cq.c @@ -291,6 +291,32 @@ out: return wc_index; } +void mana_drain_gsi_sqs(struct mana_ib_dev *mdev) +{ + struct mana_ib_qp *qp = mana_get_qp_ref(mdev, MANA_GSI_QPN, false); + struct ud_sq_shadow_wqe *shadow_wqe; + struct mana_ib_cq *cq; + unsigned long flags; + + if (!qp) + return; + + cq = container_of(qp->ibqp.send_cq, struct mana_ib_cq, ibcq); + + spin_lock_irqsave(&cq->cq_lock, flags); + while ((shadow_wqe = shadow_queue_get_next_to_complete(&qp->shadow_sq)) + != NULL) { + shadow_wqe->header.error_code = IB_WC_GENERAL_ERR; + shadow_queue_advance_next_to_complete(&qp->shadow_sq); + } + 
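/* + * Every send WQE still pending in the shadow SQ has been completed + * with IB_WC_GENERAL_ERR above; waking the completion handler below + * lets the GSI consumer poll out these flushed entries even though + * the hardware will never report completions for them. + */ + 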
spin_unlock_irqrestore(&cq->cq_lock, flags); + + if (cq->ibcq.comp_handler) + cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context); + + mana_put_qp_ref(qp); +} + int mana_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) { struct mana_ib_cq *cq = container_of(ibcq, struct mana_ib_cq, ibcq); diff --git a/drivers/infiniband/hw/mana/device.c b/drivers/infiniband/hw/mana/device.c index fa60872f169f..bdeddb642b87 100644 --- a/drivers/infiniband/hw/mana/device.c +++ b/drivers/infiniband/hw/mana/device.c @@ -230,6 +230,9 @@ static void mana_ib_remove(struct auxiliary_device *adev) { struct mana_ib_dev *dev = dev_get_drvdata(&adev->dev); + if (mana_ib_is_rnic(dev)) + mana_drain_gsi_sqs(dev); + ib_unregister_device(&dev->ib_dev); dma_pool_destroy(dev->av_pool); if (mana_ib_is_rnic(dev)) { diff --git a/drivers/infiniband/hw/mana/main.c b/drivers/infiniband/hw/mana/main.c index 6a2471f2e804..fac159f7128d 100644 --- a/drivers/infiniband/hw/mana/main.c +++ b/drivers/infiniband/hw/mana/main.c @@ -273,9 +273,8 @@ int mana_ib_create_queue(struct mana_ib_dev *mdev, u64 addr, u32 size, umem = ib_umem_get(&mdev->ib_dev, addr, size, IB_ACCESS_LOCAL_WRITE); if (IS_ERR(umem)) { - err = PTR_ERR(umem); - ibdev_dbg(&mdev->ib_dev, "Failed to get umem, %d\n", err); - return err; + ibdev_dbg(&mdev->ib_dev, "Failed to get umem, %pe\n", umem); + return PTR_ERR(umem); } err = mana_ib_create_zero_offset_dma_region(mdev, umem, &queue->gdma_region); diff --git a/drivers/infiniband/hw/mana/mana_ib.h b/drivers/infiniband/hw/mana/mana_ib.h index 5d31034ac7fb..9d36232ed880 100644 --- a/drivers/infiniband/hw/mana/mana_ib.h +++ b/drivers/infiniband/hw/mana/mana_ib.h @@ -43,6 +43,8 @@ */ #define MANA_AV_BUFFER_SIZE 64 +#define MANA_GSI_QPN (1) + struct mana_ib_adapter_caps { u32 max_sq_id; u32 max_rq_id; @@ -410,7 +412,7 @@ struct mana_ib_ah_attr { u8 traffic_class; u16 src_port; u16 dest_port; - u32 reserved; + u32 flow_label; }; struct mana_rnic_set_qp_state_req { @@ -427,8 +429,15 @@ struct mana_rnic_set_qp_state_req { u32 retry_cnt; u32 rnr_retry; u32 min_rnr_timer; - u32 reserved; + u32 rate_limit; struct mana_ib_ah_attr ah_attr; + u64 reserved1; + u32 qkey; + u32 qp_access_flags; + u8 local_ack_timeout; + u8 max_rd_atomic; + u16 reserved2; + u32 reserved3; }; /* HW Data */ struct mana_rnic_set_qp_state_resp { @@ -718,6 +727,7 @@ int mana_ib_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, int mana_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, const struct ib_send_wr **bad_wr); +void mana_drain_gsi_sqs(struct mana_ib_dev *mdev); int mana_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); int mana_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags); diff --git a/drivers/infiniband/hw/mana/mr.c b/drivers/infiniband/hw/mana/mr.c index 55701046ffba..3d0245a4c1ed 100644 --- a/drivers/infiniband/hw/mana/mr.c +++ b/drivers/infiniband/hw/mana/mr.c @@ -138,7 +138,8 @@ struct ib_mr *mana_ib_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 length, if (IS_ERR(mr->umem)) { err = PTR_ERR(mr->umem); ibdev_dbg(ibdev, - "Failed to get umem for register user-mr, %d\n", err); + "Failed to get umem for register user-mr, %pe\n", + mr->umem); goto err_free; } @@ -220,7 +221,8 @@ struct ib_mr *mana_ib_reg_user_mr_dmabuf(struct ib_pd *ibpd, u64 start, u64 leng umem_dmabuf = ib_umem_dmabuf_get_pinned(ibdev, start, length, fd, access_flags); if (IS_ERR(umem_dmabuf)) { err = PTR_ERR(umem_dmabuf); - ibdev_dbg(ibdev, "Failed to get dmabuf umem, %d\n", err); + ibdev_dbg(ibdev, "Failed to 
get dmabuf umem, %pe\n", + umem_dmabuf); goto err_free; } diff --git a/drivers/infiniband/hw/mana/qp.c b/drivers/infiniband/hw/mana/qp.c index a6bf4d539e67..48c1f4977f21 100644 --- a/drivers/infiniband/hw/mana/qp.c +++ b/drivers/infiniband/hw/mana/qp.c @@ -735,6 +735,8 @@ static int mana_ib_gd_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int err; mana_gd_init_req_hdr(&req.hdr, MANA_IB_SET_QP_STATE, sizeof(req), sizeof(resp)); + + req.hdr.req.msg_version = GDMA_MESSAGE_V3; req.hdr.dev_id = mdev->gdma_dev->dev_id; req.adapter = mdev->adapter_handle; req.qp_handle = qp->qp_handle; @@ -748,6 +750,12 @@ static int mana_ib_gd_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, req.retry_cnt = attr->retry_cnt; req.rnr_retry = attr->rnr_retry; req.min_rnr_timer = attr->min_rnr_timer; + req.rate_limit = attr->rate_limit; + req.qkey = attr->qkey; + req.local_ack_timeout = attr->timeout; + req.qp_access_flags = attr->qp_access_flags; + req.max_rd_atomic = attr->max_rd_atomic; + if (attr_mask & IB_QP_AV) { ndev = mana_ib_get_netdev(&mdev->ib_dev, ibqp->port); if (!ndev) { @@ -774,6 +782,7 @@ static int mana_ib_gd_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, ibqp->qp_num, attr->dest_qp_num); req.ah_attr.traffic_class = attr->ah_attr.grh.traffic_class >> 2; req.ah_attr.hop_limit = attr->ah_attr.grh.hop_limit; + req.ah_attr.flow_label = attr->ah_attr.grh.flow_label; } err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c index e6e132f10625..91c714f72099 100644 --- a/drivers/infiniband/hw/mlx4/mad.c +++ b/drivers/infiniband/hw/mlx4/mad.c @@ -1836,9 +1836,9 @@ static int create_pv_sqp(struct mlx4_ib_demux_pv_ctx *ctx, tun_qp->qp = ib_create_qp(ctx->pd, &qp_init_attr.init_attr); if (IS_ERR(tun_qp->qp)) { ret = PTR_ERR(tun_qp->qp); + pr_err("Couldn't create %s QP (%pe)\n", + create_tun ? "tunnel" : "special", tun_qp->qp); tun_qp->qp = NULL; - pr_err("Couldn't create %s QP (%d)\n", - create_tun ? 
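/* %pe prints the ERR_PTR as an error name, e.g. -ENOMEM */ 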
"tunnel" : "special", ret); return ret; } @@ -2017,14 +2017,14 @@ static int create_pv_resources(struct ib_device *ibdev, int slave, int port, NULL, ctx, &cq_attr); if (IS_ERR(ctx->cq)) { ret = PTR_ERR(ctx->cq); - pr_err("Couldn't create tunnel CQ (%d)\n", ret); + pr_err("Couldn't create tunnel CQ (%pe)\n", ctx->cq); goto err_buf; } ctx->pd = ib_alloc_pd(ctx->ib_dev, 0); if (IS_ERR(ctx->pd)) { ret = PTR_ERR(ctx->pd); - pr_err("Couldn't create tunnel PD (%d)\n", ret); + pr_err("Couldn't create tunnel PD (%pe)\n", ctx->pd); goto err_cq; } diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 50fd407103c7..f2887ae6390e 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -1652,7 +1652,8 @@ int mlx4_ib_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *init_attr, sqp->roce_v2_gsi = ib_create_qp(pd, init_attr); if (IS_ERR(sqp->roce_v2_gsi)) { - pr_err("Failed to create GSI QP for RoCEv2 (%ld)\n", PTR_ERR(sqp->roce_v2_gsi)); + pr_err("Failed to create GSI QP for RoCEv2 (%pe)\n", + sqp->roce_v2_gsi); sqp->roce_v2_gsi = NULL; } else { to_mqp(sqp->roce_v2_gsi)->flags |= diff --git a/drivers/infiniband/hw/mlx5/data_direct.c b/drivers/infiniband/hw/mlx5/data_direct.c index b9ba84afaae2..b81ac5709b56 100644 --- a/drivers/infiniband/hw/mlx5/data_direct.c +++ b/drivers/infiniband/hw/mlx5/data_direct.c @@ -35,7 +35,7 @@ static int mlx5_data_direct_vpd_get_vuid(struct mlx5_data_direct_dev *dev) vpd_data = pci_vpd_alloc(pdev, &vpd_size); if (IS_ERR(vpd_data)) { - pci_err(pdev, "Unable to read VPD, err=%ld\n", PTR_ERR(vpd_data)); + pci_err(pdev, "Unable to read VPD, err=%pe\n", vpd_data); return PTR_ERR(vpd_data); } diff --git a/drivers/infiniband/hw/mlx5/gsi.c b/drivers/infiniband/hw/mlx5/gsi.c index b804f2dd5628..d5487834ed25 100644 --- a/drivers/infiniband/hw/mlx5/gsi.c +++ b/drivers/infiniband/hw/mlx5/gsi.c @@ -131,8 +131,9 @@ int mlx5_ib_create_gsi(struct ib_pd *pd, struct mlx5_ib_qp *mqp, gsi->cq = ib_alloc_cq(pd->device, gsi, attr->cap.max_send_wr, 0, IB_POLL_SOFTIRQ); if (IS_ERR(gsi->cq)) { - mlx5_ib_warn(dev, "unable to create send CQ for GSI QP. error %ld\n", - PTR_ERR(gsi->cq)); + mlx5_ib_warn(dev, + "unable to create send CQ for GSI QP. error %pe\n", + gsi->cq); ret = PTR_ERR(gsi->cq); goto err_free_wrs; } @@ -147,8 +148,9 @@ int mlx5_ib_create_gsi(struct ib_pd *pd, struct mlx5_ib_qp *mqp, gsi->rx_qp = ib_create_qp(pd, &hw_init_attr); if (IS_ERR(gsi->rx_qp)) { - mlx5_ib_warn(dev, "unable to create hardware GSI QP. error %ld\n", - PTR_ERR(gsi->rx_qp)); + mlx5_ib_warn(dev, + "unable to create hardware GSI QP. 
error %pe\n", + gsi->rx_qp); ret = PTR_ERR(gsi->rx_qp); goto err_destroy_cq; } @@ -294,8 +296,9 @@ static void setup_qp(struct mlx5_ib_gsi_qp *gsi, u16 qp_index) qp = create_gsi_ud_qp(gsi); if (IS_ERR(qp)) { - mlx5_ib_warn(dev, "unable to create hardware UD QP for GSI: %ld\n", - PTR_ERR(qp)); + mlx5_ib_warn(dev, + "unable to create hardware UD QP for GSI: %pe\n", + qp); return; } diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index d456e4fde3e1..fc1e86f6c409 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -13,6 +13,7 @@ #include <linux/dma-mapping.h> #include <linux/slab.h> #include <linux/bitmap.h> +#include <linux/log2.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/sched/task.h> @@ -883,6 +884,51 @@ static void fill_esw_mgr_reg_c0(struct mlx5_core_dev *mdev, resp->reg_c0.mask = mlx5_eswitch_get_vport_metadata_mask(); } +/* + * Calculate maximum SQ overhead across all QP types. + * Other QP types (REG_UMR, UC, RC, UD/SMI/GSI, XRC_TGT) + * have smaller overhead than the types calculated below, + * so they are implicitly included. + */ +static u32 mlx5_ib_calc_max_sq_overhead(void) +{ + u32 max_overhead_xrc, overhead_ud_lso, a, b; + + /* XRC_INI */ + max_overhead_xrc = sizeof(struct mlx5_wqe_xrc_seg); + max_overhead_xrc += sizeof(struct mlx5_wqe_ctrl_seg); + a = sizeof(struct mlx5_wqe_atomic_seg) + + sizeof(struct mlx5_wqe_raddr_seg); + b = sizeof(struct mlx5_wqe_umr_ctrl_seg) + + sizeof(struct mlx5_mkey_seg) + + MLX5_IB_SQ_UMR_INLINE_THRESHOLD / MLX5_IB_UMR_OCTOWORD; + max_overhead_xrc += max(a, b); + + /* UD with LSO */ + overhead_ud_lso = sizeof(struct mlx5_wqe_ctrl_seg); + overhead_ud_lso += sizeof(struct mlx5_wqe_eth_pad); + overhead_ud_lso += sizeof(struct mlx5_wqe_eth_seg); + overhead_ud_lso += sizeof(struct mlx5_wqe_datagram_seg); + + return max(max_overhead_xrc, overhead_ud_lso); +} + +static u32 mlx5_ib_calc_max_qp_wr(struct mlx5_ib_dev *dev) +{ + struct mlx5_core_dev *mdev = dev->mdev; + u32 max_wqe_bb_units = 1 << MLX5_CAP_GEN(mdev, log_max_qp_sz); + u32 max_wqe_size; + /* max QP overhead + 1 SGE, no inline, no special features */ + max_wqe_size = mlx5_ib_calc_max_sq_overhead() + + sizeof(struct mlx5_wqe_data_seg); + + max_wqe_size = roundup_pow_of_two(max_wqe_size); + + max_wqe_size = ALIGN(max_wqe_size, MLX5_SEND_WQE_BB); + + return (max_wqe_bb_units * MLX5_SEND_WQE_BB) / max_wqe_size; +} + static int mlx5_ib_query_device(struct ib_device *ibdev, struct ib_device_attr *props, struct ib_udata *uhw) @@ -1041,7 +1087,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, props->max_mr_size = ~0ull; props->page_size_cap = ~(min_page_size - 1); props->max_qp = 1 << MLX5_CAP_GEN(mdev, log_max_qp); - props->max_qp_wr = 1 << MLX5_CAP_GEN(mdev, log_max_qp_sz); + props->max_qp_wr = mlx5_ib_calc_max_qp_wr(dev); max_rq_sg = MLX5_CAP_GEN(mdev, max_wqe_sz_rq) / sizeof(struct mlx5_wqe_data_seg); max_sq_desc = min_t(int, MLX5_CAP_GEN(mdev, max_wqe_sz_sq), 512); @@ -1793,7 +1839,8 @@ static void deallocate_uars(struct mlx5_ib_dev *dev, } static int mlx5_ib_enable_lb_mp(struct mlx5_core_dev *master, - struct mlx5_core_dev *slave) + struct mlx5_core_dev *slave, + struct mlx5_ib_lb_state *lb_state) { int err; @@ -1805,6 +1852,7 @@ static int mlx5_ib_enable_lb_mp(struct mlx5_core_dev *master, if (err) goto out; + lb_state->force_enable = true; return 0; out: @@ -1813,16 +1861,22 @@ out: } static void mlx5_ib_disable_lb_mp(struct mlx5_core_dev *master, - struct mlx5_core_dev *slave) + 
struct mlx5_core_dev *slave, + struct mlx5_ib_lb_state *lb_state) { mlx5_nic_vport_update_local_lb(slave, false); mlx5_nic_vport_update_local_lb(master, false); + + lb_state->force_enable = false; } int mlx5_ib_enable_lb(struct mlx5_ib_dev *dev, bool td, bool qp) { int err = 0; + if (dev->lb.force_enable) + return 0; + mutex_lock(&dev->lb.mutex); if (td) dev->lb.user_td++; @@ -1844,6 +1898,9 @@ int mlx5_ib_enable_lb(struct mlx5_ib_dev *dev, bool td, bool qp) void mlx5_ib_disable_lb(struct mlx5_ib_dev *dev, bool td, bool qp) { + if (dev->lb.force_enable) + return; + mutex_lock(&dev->lb.mutex); if (td) dev->lb.user_td--; @@ -2994,14 +3051,16 @@ int mlx5_ib_dev_res_cq_init(struct mlx5_ib_dev *dev) pd = ib_alloc_pd(ibdev, 0); if (IS_ERR(pd)) { ret = PTR_ERR(pd); - mlx5_ib_err(dev, "Couldn't allocate PD for res init, err=%d\n", ret); + mlx5_ib_err(dev, "Couldn't allocate PD for res init, err=%pe\n", + pd); goto unlock; } cq = ib_create_cq(ibdev, NULL, NULL, NULL, &cq_attr); if (IS_ERR(cq)) { ret = PTR_ERR(cq); - mlx5_ib_err(dev, "Couldn't create CQ for res init, err=%d\n", ret); + mlx5_ib_err(dev, "Couldn't create CQ for res init, err=%pe\n", + cq); ib_dealloc_pd(pd); goto unlock; } @@ -3045,7 +3104,9 @@ int mlx5_ib_dev_res_srq_init(struct mlx5_ib_dev *dev) s0 = ib_create_srq(devr->p0, &attr); if (IS_ERR(s0)) { ret = PTR_ERR(s0); - mlx5_ib_err(dev, "Couldn't create SRQ 0 for res init, err=%d\n", ret); + mlx5_ib_err(dev, + "Couldn't create SRQ 0 for res init, err=%pe\n", + s0); goto unlock; } @@ -3057,7 +3118,9 @@ int mlx5_ib_dev_res_srq_init(struct mlx5_ib_dev *dev) s1 = ib_create_srq(devr->p0, &attr); if (IS_ERR(s1)) { ret = PTR_ERR(s1); - mlx5_ib_err(dev, "Couldn't create SRQ 1 for res init, err=%d\n", ret); + mlx5_ib_err(dev, + "Couldn't create SRQ 1 for res init, err=%pe\n", + s1); ib_destroy_srq(s0); } @@ -3118,6 +3181,7 @@ mlx5_ib_create_data_direct_resources(struct mlx5_ib_dev *dev) { int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); struct mlx5_core_dev *mdev = dev->mdev; + bool ro_supp = false; void *mkc; u32 mkey; u32 pdn; @@ -3146,14 +3210,37 @@ mlx5_ib_create_data_direct_resources(struct mlx5_ib_dev *dev) MLX5_SET(mkc, mkc, length64, 1); MLX5_SET(mkc, mkc, qpn, 0xffffff); err = mlx5_core_create_mkey(mdev, &mkey, in, inlen); - kvfree(in); if (err) - goto err; + goto err_mkey; dev->ddr.mkey = mkey; dev->ddr.pdn = pdn; + + /* create another mkey with RO support */ + if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write)) { + MLX5_SET(mkc, mkc, relaxed_ordering_write, 1); + ro_supp = true; + } + + if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read)) { + MLX5_SET(mkc, mkc, relaxed_ordering_read, 1); + ro_supp = true; + } + + if (ro_supp) { + err = mlx5_core_create_mkey(mdev, &mkey, in, inlen); + /* RO is defined as best effort */ + if (!err) { + dev->ddr.mkey_ro = mkey; + dev->ddr.mkey_ro_valid = true; + } + } + + kvfree(in); return 0; +err_mkey: + kvfree(in); err: mlx5_core_dealloc_pd(mdev, pdn); return err; @@ -3162,6 +3249,10 @@ err: static void mlx5_ib_free_data_direct_resources(struct mlx5_ib_dev *dev) { + + if (dev->ddr.mkey_ro_valid) + mlx5_core_destroy_mkey(dev->mdev, dev->ddr.mkey_ro); + mlx5_core_destroy_mkey(dev->mdev, dev->ddr.mkey); mlx5_core_dealloc_pd(dev->mdev, dev->ddr.pdn); } @@ -3523,7 +3614,7 @@ static void mlx5_ib_unbind_slave_port(struct mlx5_ib_dev *ibdev, lockdep_assert_held(&mlx5_ib_multiport_mutex); - mlx5_ib_disable_lb_mp(ibdev->mdev, mpi->mdev); + mlx5_ib_disable_lb_mp(ibdev->mdev, mpi->mdev, &ibdev->lb); mlx5_core_mp_event_replay(ibdev->mdev, 
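/* lb.force_enable was cleared by mlx5_ib_disable_lb_mp() above, so the refcounted mlx5_ib_enable_lb()/mlx5_ib_disable_lb() paths take effect again */ 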
MLX5_DRIVER_EVENT_AFFILIATION_REMOVED, @@ -3620,7 +3711,7 @@ static bool mlx5_ib_bind_slave_port(struct mlx5_ib_dev *ibdev, MLX5_DRIVER_EVENT_AFFILIATION_DONE, &key); - err = mlx5_ib_enable_lb_mp(ibdev->mdev, mpi->mdev); + err = mlx5_ib_enable_lb_mp(ibdev->mdev, mpi->mdev, &ibdev->lb); if (err) goto unbind; diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 7ffc7ee92cf0..09d82d5f95e3 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -854,6 +854,8 @@ struct mlx5_ib_port_resources { struct mlx5_data_direct_resources { u32 pdn; u32 mkey; + u32 mkey_ro; + u8 mkey_ro_valid :1; }; struct mlx5_ib_resources { @@ -1109,6 +1111,7 @@ struct mlx5_ib_lb_state { u32 user_td; int qps; bool enabled; + bool force_enable; }; struct mlx5_ib_pf_eq { @@ -1802,6 +1805,10 @@ mlx5_umem_mkc_find_best_pgsz(struct mlx5_ib_dev *dev, struct ib_umem *umem, bitmap = GENMASK_ULL(max_log_entity_size_cap, min_log_entity_size_cap); + /* In KSM mode HW requires IOVA and mkey's page size to be aligned */ + if (access_mode == MLX5_MKC_ACCESS_MODE_KSM && iova) + bitmap &= GENMASK_ULL(__ffs64(iova), 0); + return ib_umem_find_best_pgsz(umem, bitmap, iova); } diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 1317f2cb38a4..325fa04cbe8a 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -1652,8 +1652,7 @@ reg_user_mr_dmabuf(struct ib_pd *pd, struct device *dma_device, fd, access_flags); if (IS_ERR(umem_dmabuf)) { - mlx5_ib_dbg(dev, "umem_dmabuf get failed (%ld)\n", - PTR_ERR(umem_dmabuf)); + mlx5_ib_dbg(dev, "umem_dmabuf get failed (%pe)\n", umem_dmabuf); return ERR_CAST(umem_dmabuf); } @@ -1717,11 +1716,11 @@ reg_user_mr_dmabuf_by_data_direct(struct ib_pd *pd, u64 offset, goto end; } - /* The device's 'data direct mkey' was created without RO flags to - * simplify things and allow for a single mkey per device. - * Since RO is not a must, mask it out accordingly. + /* If no device's 'data direct mkey' with RO flags exists + * mask it out accordingly. */ - access_flags &= ~IB_ACCESS_RELAXED_ORDERING; + if (!dev->ddr.mkey_ro_valid) + access_flags &= ~IB_ACCESS_RELAXED_ORDERING; crossed_mr = reg_user_mr_dmabuf(pd, &data_direct_dev->pdev->dev, offset, length, virt_addr, fd, access_flags, MLX5_MKC_ACCESS_MODE_KSM, diff --git a/drivers/infiniband/hw/mlx5/umr.c b/drivers/infiniband/hw/mlx5/umr.c index 7ef35cddce81..4e562e0dd9e1 100644 --- a/drivers/infiniband/hw/mlx5/umr.c +++ b/drivers/infiniband/hw/mlx5/umr.c @@ -761,7 +761,11 @@ _mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags, bool dd, if (dd) { cur_ksm->va = cpu_to_be64(rdma_block_iter_dma_address(&biter)); - cur_ksm->key = cpu_to_be32(dev->ddr.mkey); + if (mr->access_flags & IB_ACCESS_RELAXED_ORDERING && + dev->ddr.mkey_ro_valid) + cur_ksm->key = cpu_to_be32(dev->ddr.mkey_ro); + else + cur_ksm->key = cpu_to_be32(dev->ddr.mkey); if (mr->umem->is_dmabuf && (flags & MLX5_IB_UPD_XLT_ZAP)) { cur_ksm->va = 0; diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index e825e2ef7966..134a79eecfcb 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -492,7 +492,7 @@ static int alloc_qpn(struct rvt_dev_info *rdi, struct rvt_qpn_table *qpt, { u32 i, offset, max_scan, qpn; struct rvt_qpn_map *map; - u32 ret; + int ret; u32 max_qpn = exclude_prefix == RVT_AIP_QP_PREFIX ? 
RVT_AIP_QPN_MAX : RVT_QPN_MAX; @@ -510,7 +510,8 @@ static int alloc_qpn(struct rvt_dev_info *rdi, struct rvt_qpn_table *qpt, else qpt->flags |= n; spin_unlock(&qpt->lock); - goto bail; + + return ret; } qpn = qpt->last + qpt->incr; @@ -530,7 +531,8 @@ static int alloc_qpn(struct rvt_dev_info *rdi, struct rvt_qpn_table *qpt, if (!test_and_set_bit(offset, map->page)) { qpt->last = qpn; ret = qpn; - goto bail; + + return ret; } offset += qpt->incr; /* @@ -565,10 +567,7 @@ static int alloc_qpn(struct rvt_dev_info *rdi, struct rvt_qpn_table *qpt, qpn = mk_qpn(qpt, map, offset); } - ret = -ENOMEM; - -bail: - return ret; + return -ENOMEM; } /** diff --git a/drivers/infiniband/sw/rxe/rxe_task.c b/drivers/infiniband/sw/rxe/rxe_task.c index 6f8f353e9583..f522820b950c 100644 --- a/drivers/infiniband/sw/rxe/rxe_task.c +++ b/drivers/infiniband/sw/rxe/rxe_task.c @@ -132,8 +132,12 @@ static void do_task(struct rxe_task *task) * yield the cpu and reschedule the task */ if (!ret) { - task->state = TASK_STATE_IDLE; - resched = 1; + if (task->state != TASK_STATE_DRAINING) { + task->state = TASK_STATE_IDLE; + resched = 1; + } else { + cont = 1; + } goto exit; } diff --git a/drivers/infiniband/sw/siw/siw_verbs.c b/drivers/infiniband/sw/siw/siw_verbs.c index 35c3bde0d00a..efa2f097b582 100644 --- a/drivers/infiniband/sw/siw/siw_verbs.c +++ b/drivers/infiniband/sw/siw/siw_verbs.c @@ -769,7 +769,7 @@ int siw_post_send(struct ib_qp *base_qp, const struct ib_send_wr *wr, struct siw_wqe *wqe = tx_wqe(qp); unsigned long flags; - int rv = 0; + int rv = 0, imm_err = 0; if (wr && !rdma_is_kernel_res(&qp->base_qp.res)) { siw_dbg_qp(qp, "wr must be empty for user mapped sq\n"); @@ -955,9 +955,17 @@ int siw_post_send(struct ib_qp *base_qp, const struct ib_send_wr *wr, * Send directly if SQ processing is not in progress. * Eventual immediate errors (rv < 0) do not affect the involved * RI resources (Verbs, 8.3.1) and thus do not prevent from SQ - * processing, if new work is already pending. But rv must be passed - * to caller. + * processing, if new work is already pending. But rv and a pointer + * to the failed work request must be passed to the caller. */ + if (unlikely(rv < 0)) { + /* + * Immediate error + */ + siw_dbg_qp(qp, "Immediate error %d\n", rv); + imm_err = rv; + *bad_wr = wr; + } if (wqe->wr_status != SIW_WR_IDLE) { spin_unlock_irqrestore(&qp->sq_lock, flags); goto skip_direct_sending; @@ -982,15 +990,10 @@ skip_direct_sending: up_read(&qp->state_lock); - if (rv >= 0) - return 0; - /* - * Immediate error - */ - siw_dbg_qp(qp, "error %d\n", rv); + if (unlikely(imm_err)) + return imm_err; - *bad_wr = wr; - return rv; + return (rv >= 0) ? 0 : rv; } /* diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 7acafc5c0e09..5b4d76e97437 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -351,26 +351,27 @@ static bool ipoib_is_dev_match_addr_rcu(const struct sockaddr *addr, } /* - * Find the master net_device on top of the given net_device. + * Find the L2 master net_device on top of the given net_device. * @dev: base IPoIB net_device * - * Returns the master net_device with a reference held, or the same net_device - * if no master exists. + * Returns the L2 master net_device with a reference held if an L2 master + * (such as a bond netdevice) exists, or the same netdev with a reference + * held when no master exists or when the master is an L3 master (such as + * a VRF netdev). 
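+ * For example, with bond0 stacked on ib0 this returns bond0 (with a + * reference held), while with a VRF device stacked on ib0 it returns + * ib0 itself. 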
*/ static struct net_device *ipoib_get_master_net_dev(struct net_device *dev) { struct net_device *master; rcu_read_lock(); + master = netdev_master_upper_dev_get_rcu(dev); + if (!master || netif_is_l3_master(master)) + master = dev; + dev_hold(master); rcu_read_unlock(); - if (master) - return master; - - dev_hold(dev); - return dev; + return master; } struct ipoib_walk_data { @@ -522,7 +523,7 @@ static struct net_device *ipoib_get_net_dev_by_params( if (ret) return NULL; - /* See if we can find a unique device matching the L2 parameters */ + /* See if we can find a unique device matching the pkey and GID */ matches = __ipoib_get_net_dev_by_params(dev_list, port, pkey_index, gid, NULL, &net_dev); @@ -535,7 +536,7 @@ static struct net_device *ipoib_get_net_dev_by_params( dev_put(net_dev); - /* Couldn't find a unique device with L2 parameters only. Use L3 + /* Couldn't find a unique device with pkey and GID only. Use L3 * address to uniquely match the net device */ matches = __ipoib_get_net_dev_by_params(dev_list, port, pkey_index, gid, addr, &net_dev); diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index 5dfb4644446b..71269446353d 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -667,9 +667,9 @@ static int srpt_refresh_port(struct srpt_port *sport) srpt_mad_recv_handler, sport, 0); if (IS_ERR(mad_agent)) { - pr_err("%s-%d: MAD agent registration failed (%ld). Note: this is expected if SR-IOV is enabled.\n", + pr_err("%s-%d: MAD agent registration failed (%pe). Note: this is expected if SR-IOV is enabled.\n", dev_name(&sport->sdev->device->dev), sport->port, - PTR_ERR(mad_agent)); + mad_agent); sport->mad_agent = NULL; memset(&port_modify, 0, sizeof(port_modify)); port_modify.clr_port_cap_mask = IB_PORT_DEVICE_MGMT_SUP; @@ -1865,8 +1865,8 @@ retry: IB_POLL_WORKQUEUE); if (IS_ERR(ch->cq)) { ret = PTR_ERR(ch->cq); - pr_err("failed to create CQ cqe= %d ret= %d\n", - ch->rq_size + sq_size, ret); + pr_err("failed to create CQ cqe= %d ret= %pe\n", + ch->rq_size + sq_size, ch->cq); goto out; } ch->cq_size = ch->rq_size + sq_size; @@ -3132,7 +3132,7 @@ static int srpt_alloc_srq(struct srpt_device *sdev) WARN_ON_ONCE(sdev->srq); srq = ib_create_srq(sdev->pd, &srq_attr); if (IS_ERR(srq)) { - pr_debug("ib_create_srq() failed: %ld\n", PTR_ERR(srq)); + pr_debug("ib_create_srq() failed: %pe\n", srq); return PTR_ERR(srq); } @@ -3236,8 +3236,7 @@ static int srpt_add_one(struct ib_device *device) if (rdma_port_get_link_layer(device, 1) == IB_LINK_LAYER_INFINIBAND) sdev->cm_id = ib_create_cm_id(device, srpt_cm_handler, sdev); if (IS_ERR(sdev->cm_id)) { - pr_info("ib_create_cm_id() failed: %ld\n", - PTR_ERR(sdev->cm_id)); + pr_info("ib_create_cm_id() failed: %pe\n", sdev->cm_id); ret = PTR_ERR(sdev->cm_id); sdev->cm_id = NULL; if (!rdma_cm_id) @@ -3687,8 +3686,7 @@ static struct rdma_cm_id *srpt_create_rdma_id(struct sockaddr *listen_addr) rdma_cm_id = rdma_create_id(&init_net, srpt_rdma_cm_handler, NULL, RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(rdma_cm_id)) { - pr_err("RDMA/CM ID creation failed: %ld\n", - PTR_ERR(rdma_cm_id)); + pr_err("RDMA/CM ID creation failed: %pe\n", rdma_cm_id); goto out; } diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h index 95f63c5f6159..a698a2e7ce2a 100644 --- a/drivers/iommu/amd/amd_iommu_types.h +++ b/drivers/iommu/amd/amd_iommu_types.h @@ -792,6 +792,11 @@ struct amd_iommu { u32 flags; volatile u64 *cmd_sem; atomic64_t cmd_sem_val; + /* + * Track 
the physical address of cmd_sem so that build_completion_wait() can + * use it directly, avoiding any special checks and handling for kdump. + */ + u64 cmd_sem_paddr; #ifdef CONFIG_AMD_IOMMU_DEBUGFS /* DebugFS Info */ diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index ba9e582a8bbe..f2991c11867c 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -406,6 +406,9 @@ static void iommu_set_device_table(struct amd_iommu *iommu) BUG_ON(iommu->mmio_base == NULL); + if (is_kdump_kernel()) + return; + entry = iommu_virt_to_phys(dev_table); entry |= (dev_table_size >> 12) - 1; memcpy_toio(iommu->mmio_base + MMIO_DEV_TABLE_OFFSET, &entry, sizeof(entry)); @@ -646,7 +649,10 @@ static inline int __init alloc_dev_table(struct amd_iommu_pci_seg *pci_seg) static inline void free_dev_table(struct amd_iommu_pci_seg *pci_seg) { - iommu_free_pages(pci_seg->dev_table); + if (is_kdump_kernel()) + memunmap((void *)pci_seg->dev_table); + else + iommu_free_pages(pci_seg->dev_table); pci_seg->dev_table = NULL; } @@ -710,6 +716,26 @@ static void __init free_alias_table(struct amd_iommu_pci_seg *pci_seg) pci_seg->alias_table = NULL; } +static inline void *iommu_memremap(unsigned long paddr, size_t size) +{ + phys_addr_t phys; + + if (!paddr) + return NULL; + + /* + * Obtain the true physical address in the kdump kernel when SME is + * enabled. Note that a previous kernel with SME enabled together with + * a kdump kernel with SME support disabled is currently not supported. + */ + phys = __sme_clr(paddr); + + if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) + return (__force void *)ioremap_encrypted(phys, size); + else + return memremap(phys, size, MEMREMAP_WB); +} + /* * Allocates the command buffer. This buffer is per AMD IOMMU. We can * write commands to that buffer later and the IOMMU will execute them @@ -795,11 +821,16 @@ static void iommu_enable_command_buffer(struct amd_iommu *iommu) BUG_ON(iommu->cmd_buf == NULL); - entry = iommu_virt_to_phys(iommu->cmd_buf); - entry |= MMIO_CMD_SIZE_512; - - memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET, - &entry, sizeof(entry)); + if (!is_kdump_kernel()) { + /* + * The command buffer is re-used by the kdump kernel, so setting + * the MMIO register is not required. + */ + entry = iommu_virt_to_phys(iommu->cmd_buf); + entry |= MMIO_CMD_SIZE_512; + memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET, + &entry, sizeof(entry)); + } amd_iommu_reset_cmd_buffer(iommu); } @@ -850,10 +881,15 @@ static void iommu_enable_event_buffer(struct amd_iommu *iommu) BUG_ON(iommu->evt_buf == NULL); - entry = iommu_virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK; - - memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET, - &entry, sizeof(entry)); + if (!is_kdump_kernel()) { + /* + * The event buffer is re-used by the kdump kernel, so setting + * the MMIO register is not required. 
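+ * The buffer's head and tail pointers are still reset to zero + * below, as on a regular boot. 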
+ */ + entry = iommu_virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK; + memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET, + &entry, sizeof(entry)); + } /* set head and tail to zero manually */ writel(0x00, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET); @@ -942,8 +978,91 @@ err_out: static int __init alloc_cwwb_sem(struct amd_iommu *iommu) { iommu->cmd_sem = iommu_alloc_4k_pages(iommu, GFP_KERNEL, 1); + if (!iommu->cmd_sem) + return -ENOMEM; + iommu->cmd_sem_paddr = iommu_virt_to_phys((void *)iommu->cmd_sem); + return 0; +} + +static int __init remap_event_buffer(struct amd_iommu *iommu) +{ + u64 paddr; + + pr_info_once("Re-using event buffer from the previous kernel\n"); + paddr = readq(iommu->mmio_base + MMIO_EVT_BUF_OFFSET) & PM_ADDR_MASK; + iommu->evt_buf = iommu_memremap(paddr, EVT_BUFFER_SIZE); + + return iommu->evt_buf ? 0 : -ENOMEM; +} + +static int __init remap_command_buffer(struct amd_iommu *iommu) +{ + u64 paddr; + + pr_info_once("Re-using command buffer from the previous kernel\n"); + paddr = readq(iommu->mmio_base + MMIO_CMD_BUF_OFFSET) & PM_ADDR_MASK; + iommu->cmd_buf = iommu_memremap(paddr, CMD_BUFFER_SIZE); + + return iommu->cmd_buf ? 0 : -ENOMEM; +} + +static int __init remap_or_alloc_cwwb_sem(struct amd_iommu *iommu) +{ + u64 paddr; + + if (check_feature(FEATURE_SNP)) { + /* + * When SNP is enabled, the exclusion base register is used for the + * completion wait buffer (CWB) address. Read and re-use it. + */ + pr_info_once("Re-using CWB buffers from the previous kernel\n"); + paddr = readq(iommu->mmio_base + MMIO_EXCL_BASE_OFFSET) & PM_ADDR_MASK; + iommu->cmd_sem = iommu_memremap(paddr, PAGE_SIZE); + if (!iommu->cmd_sem) + return -ENOMEM; + iommu->cmd_sem_paddr = paddr; + } else { + return alloc_cwwb_sem(iommu); + } + + return 0; +} + +static int __init alloc_iommu_buffers(struct amd_iommu *iommu) +{ + int ret; + + /* + * Reuse/Remap the previous kernel's allocated completion wait + * command and event buffers for kdump boot. + */ + if (is_kdump_kernel()) { + ret = remap_or_alloc_cwwb_sem(iommu); + if (ret) + return ret; + + ret = remap_command_buffer(iommu); + if (ret) + return ret; + + ret = remap_event_buffer(iommu); + if (ret) + return ret; + } else { + ret = alloc_cwwb_sem(iommu); + if (ret) + return ret; - return iommu->cmd_sem ? 
0 : -ENOMEM; + ret = alloc_command_buffer(iommu); + if (ret) + return ret; + + ret = alloc_event_buffer(iommu); + if (ret) + return ret; + } + + return 0; } static void __init free_cwwb_sem(struct amd_iommu *iommu) @@ -951,6 +1070,38 @@ static void __init free_cwwb_sem(struct amd_iommu *iommu) if (iommu->cmd_sem) iommu_free_pages((void *)iommu->cmd_sem); } +static void __init unmap_cwwb_sem(struct amd_iommu *iommu) +{ + if (iommu->cmd_sem) { + if (check_feature(FEATURE_SNP)) + memunmap((void *)iommu->cmd_sem); + else + iommu_free_pages((void *)iommu->cmd_sem); + } +} + +static void __init unmap_command_buffer(struct amd_iommu *iommu) +{ + memunmap((void *)iommu->cmd_buf); +} + +static void __init unmap_event_buffer(struct amd_iommu *iommu) +{ + memunmap(iommu->evt_buf); +} + +static void __init free_iommu_buffers(struct amd_iommu *iommu) +{ + if (is_kdump_kernel()) { + unmap_cwwb_sem(iommu); + unmap_command_buffer(iommu); + unmap_event_buffer(iommu); + } else { + free_cwwb_sem(iommu); + free_command_buffer(iommu); + free_event_buffer(iommu); + } +} static void iommu_enable_xt(struct amd_iommu *iommu) { @@ -982,15 +1133,12 @@ static void set_dte_bit(struct dev_table_entry *dte, u8 bit) dte->data[i] |= (1UL << _bit); } -static bool __copy_device_table(struct amd_iommu *iommu) +static bool __reuse_device_table(struct amd_iommu *iommu) { - u64 int_ctl, int_tab_len, entry = 0; struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg; - struct dev_table_entry *old_devtb = NULL; - u32 lo, hi, devid, old_devtb_size; + u32 lo, hi, old_devtb_size; phys_addr_t old_devtb_phys; - u16 dom_id, dte_v, irq_v; - u64 tmp; + u64 entry; /* Each IOMMU use separate device table with the same size */ lo = readl(iommu->mmio_base + MMIO_DEV_TABLE_OFFSET); @@ -1015,66 +1163,20 @@ static bool __copy_device_table(struct amd_iommu *iommu) pr_err("The address of old device table is above 4G, not trustworthy!\n"); return false; } - old_devtb = (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT) && is_kdump_kernel()) - ? (__force void *)ioremap_encrypted(old_devtb_phys, - pci_seg->dev_table_size) - : memremap(old_devtb_phys, pci_seg->dev_table_size, MEMREMAP_WB); - - if (!old_devtb) - return false; - pci_seg->old_dev_tbl_cpy = iommu_alloc_pages_sz( - GFP_KERNEL | GFP_DMA32, pci_seg->dev_table_size); + /* + * Re-use the previous kernel's device table for kdump. 
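+ * Remapping the live table in place (rather than allocating a + * GFP_DMA32 copy as the old copy path did) preserves the + * translations that devices may still be using at crash time. 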
+ */ + pci_seg->old_dev_tbl_cpy = iommu_memremap(old_devtb_phys, pci_seg->dev_table_size); if (pci_seg->old_dev_tbl_cpy == NULL) { - pr_err("Failed to allocate memory for copying old device table!\n"); - memunmap(old_devtb); + pr_err("Failed to remap memory for reusing old device table!\n"); return false; } - for (devid = 0; devid <= pci_seg->last_bdf; ++devid) { - pci_seg->old_dev_tbl_cpy[devid] = old_devtb[devid]; - dom_id = old_devtb[devid].data[1] & DEV_DOMID_MASK; - dte_v = old_devtb[devid].data[0] & DTE_FLAG_V; - - if (dte_v && dom_id) { - pci_seg->old_dev_tbl_cpy[devid].data[0] = old_devtb[devid].data[0]; - pci_seg->old_dev_tbl_cpy[devid].data[1] = old_devtb[devid].data[1]; - /* Reserve the Domain IDs used by previous kernel */ - if (ida_alloc_range(&pdom_ids, dom_id, dom_id, GFP_ATOMIC) != dom_id) { - pr_err("Failed to reserve domain ID 0x%x\n", dom_id); - memunmap(old_devtb); - return false; - } - /* If gcr3 table existed, mask it out */ - if (old_devtb[devid].data[0] & DTE_FLAG_GV) { - tmp = (DTE_GCR3_30_15 | DTE_GCR3_51_31); - pci_seg->old_dev_tbl_cpy[devid].data[1] &= ~tmp; - tmp = (DTE_GCR3_14_12 | DTE_FLAG_GV); - pci_seg->old_dev_tbl_cpy[devid].data[0] &= ~tmp; - } - } - - irq_v = old_devtb[devid].data[2] & DTE_IRQ_REMAP_ENABLE; - int_ctl = old_devtb[devid].data[2] & DTE_IRQ_REMAP_INTCTL_MASK; - int_tab_len = old_devtb[devid].data[2] & DTE_INTTABLEN_MASK; - if (irq_v && (int_ctl || int_tab_len)) { - if ((int_ctl != DTE_IRQ_REMAP_INTCTL) || - (int_tab_len != DTE_INTTABLEN_512 && - int_tab_len != DTE_INTTABLEN_2K)) { - pr_err("Wrong old irq remapping flag: %#x\n", devid); - memunmap(old_devtb); - return false; - } - - pci_seg->old_dev_tbl_cpy[devid].data[2] = old_devtb[devid].data[2]; - } - } - memunmap(old_devtb); - return true; } -static bool copy_device_table(void) +static bool reuse_device_table(void) { struct amd_iommu *iommu; struct amd_iommu_pci_seg *pci_seg; @@ -1082,17 +1184,17 @@ static bool copy_device_table(void) if (!amd_iommu_pre_enabled) return false; - pr_warn("Translation is already enabled - trying to copy translation structures\n"); + pr_warn("Translation is already enabled - trying to reuse translation structures\n"); /* * All IOMMUs within PCI segment shares common device table. - * Hence copy device table only once per PCI segment. + * Hence reuse device table only once per PCI segment. */ for_each_pci_segment(pci_seg) { for_each_iommu(iommu) { if (pci_seg->id != iommu->pci_seg->id) continue; - if (!__copy_device_table(iommu)) + if (!__reuse_device_table(iommu)) return false; break; } @@ -1655,9 +1757,7 @@ static void __init free_sysfs(struct amd_iommu *iommu) static void __init free_iommu_one(struct amd_iommu *iommu) { free_sysfs(iommu); - free_cwwb_sem(iommu); - free_command_buffer(iommu); - free_event_buffer(iommu); + free_iommu_buffers(iommu); amd_iommu_free_ppr_log(iommu); free_ga_log(iommu); iommu_unmap_mmio_space(iommu); @@ -1821,14 +1921,9 @@ static int __init init_iommu_one_late(struct amd_iommu *iommu) { int ret; - if (alloc_cwwb_sem(iommu)) - return -ENOMEM; - - if (alloc_command_buffer(iommu)) - return -ENOMEM; - - if (alloc_event_buffer(iommu)) - return -ENOMEM; + ret = alloc_iommu_buffers(iommu); + if (ret) + return ret; iommu->int_enabled = false; @@ -2778,8 +2873,8 @@ static void early_enable_iommu(struct amd_iommu *iommu) * This function finally enables all IOMMUs found in the system after * they have been initialized. * - * Or if in kdump kernel and IOMMUs are all pre-enabled, try to copy - * the old content of device table entries. 
Not this case or copy failed, + * Or if in kdump kernel and IOMMUs are all pre-enabled, try to reuse + * the old content of device table entries. If that is not the case, or if + * reuse fails, just continue as a normal kernel does. */ static void early_enable_iommus(void) @@ -2787,18 +2882,25 @@ static void early_enable_iommus(void) struct amd_iommu *iommu; struct amd_iommu_pci_seg *pci_seg; - if (!copy_device_table()) { + if (!reuse_device_table()) { /* - * If come here because of failure in copying device table from old + * If we come here because of a failure in reusing the device table from the old * kernel with all IOMMUs enabled, print error message and try to * free allocated old_dev_tbl_cpy. */ - if (amd_iommu_pre_enabled) - pr_err("Failed to copy DEV table from previous kernel.\n"); + if (amd_iommu_pre_enabled) { + pr_err("Failed to reuse DEV table from previous kernel.\n"); + /* + * Bail out early if unable to remap/reuse the DEV table from the + * previous kernel when SNP is enabled, as IOMMU commands will + * time out without a DEV table and cause a kdump boot panic. + */ + BUG_ON(check_feature(FEATURE_SNP)); + } for_each_pci_segment(pci_seg) { if (pci_seg->old_dev_tbl_cpy != NULL) { - iommu_free_pages(pci_seg->old_dev_tbl_cpy); + memunmap((void *)pci_seg->old_dev_tbl_cpy); pci_seg->old_dev_tbl_cpy = NULL; } } @@ -2808,7 +2910,7 @@ static void early_enable_iommus(void) early_enable_iommu(iommu); } } else { - pr_info("Copied DEV table from previous kernel.\n"); + pr_info("Reused DEV table from previous kernel.\n"); for_each_pci_segment(pci_seg) { iommu_free_pages(pci_seg->dev_table); diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index eb348c63a8d0..2e1865daa1ce 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -14,6 +14,7 @@ #include <linux/pci-ats.h> #include <linux/bitmap.h> #include <linux/slab.h> +#include <linux/string_choices.h> #include <linux/debugfs.h> #include <linux/scatterlist.h> #include <linux/dma-map-ops.h> @@ -265,7 +266,7 @@ static inline int get_acpihid_device_id(struct device *dev, return -EINVAL; if (fw_bug) dev_err_once(dev, FW_BUG "No ACPI device matched UID, but %d device%s matched HID.\n", - hid_count, hid_count > 1 ? 
"s" : ""); + hid_count, str_plural(hid_count)); if (hid_count > 1) return -EINVAL; if (entry) @@ -1195,7 +1196,7 @@ static void build_completion_wait(struct iommu_cmd *cmd, struct amd_iommu *iommu, u64 data) { - u64 paddr = iommu_virt_to_phys((void *)iommu->cmd_sem); + u64 paddr = iommu->cmd_sem_paddr; memset(cmd, 0, sizeof(*cmd)); cmd->data[0] = lower_32_bits(paddr) | CMD_COMPL_WAIT_STORE_MASK; diff --git a/drivers/iommu/apple-dart.c b/drivers/iommu/apple-dart.c index 190f28d76615..95a4e62b8f63 100644 --- a/drivers/iommu/apple-dart.c +++ b/drivers/iommu/apple-dart.c @@ -122,6 +122,8 @@ #define DART_T8110_ERROR_ADDR_LO 0x170 #define DART_T8110_ERROR_ADDR_HI 0x174 +#define DART_T8110_ERROR_STREAMS 0x1c0 + #define DART_T8110_PROTECT 0x200 #define DART_T8110_UNPROTECT 0x204 #define DART_T8110_PROTECT_LOCK 0x208 @@ -133,6 +135,7 @@ #define DART_T8110_TCR 0x1000 #define DART_T8110_TCR_REMAP GENMASK(11, 8) #define DART_T8110_TCR_REMAP_EN BIT(7) +#define DART_T8110_TCR_FOUR_LEVEL BIT(3) #define DART_T8110_TCR_BYPASS_DAPF BIT(2) #define DART_T8110_TCR_BYPASS_DART BIT(1) #define DART_T8110_TCR_TRANSLATE_ENABLE BIT(0) @@ -166,22 +169,23 @@ struct apple_dart_hw { int max_sid_count; - u64 lock; - u64 lock_bit; + u32 lock; + u32 lock_bit; - u64 error; + u32 error; - u64 enable_streams; + u32 enable_streams; - u64 tcr; - u64 tcr_enabled; - u64 tcr_disabled; - u64 tcr_bypass; + u32 tcr; + u32 tcr_enabled; + u32 tcr_disabled; + u32 tcr_bypass; + u32 tcr_4level; - u64 ttbr; - u64 ttbr_valid; - u64 ttbr_addr_field_shift; - u64 ttbr_shift; + u32 ttbr; + u32 ttbr_valid; + u32 ttbr_addr_field_shift; + u32 ttbr_shift; int ttbr_count; }; @@ -217,6 +221,7 @@ struct apple_dart { u32 pgsize; u32 num_streams; u32 supports_bypass : 1; + u32 four_level : 1; struct iommu_group *sid2group[DART_MAX_STREAMS]; struct iommu_device iommu; @@ -305,13 +310,19 @@ static struct apple_dart_domain *to_dart_domain(struct iommu_domain *dom) } static void -apple_dart_hw_enable_translation(struct apple_dart_stream_map *stream_map) +apple_dart_hw_enable_translation(struct apple_dart_stream_map *stream_map, int levels) { struct apple_dart *dart = stream_map->dart; + u32 tcr = dart->hw->tcr_enabled; int sid; + if (levels == 4) + tcr |= dart->hw->tcr_4level; + + WARN_ON(levels != 3 && levels != 4); + WARN_ON(levels == 4 && !dart->four_level); for_each_set_bit(sid, stream_map->sidmap, dart->num_streams) - writel(dart->hw->tcr_enabled, dart->regs + DART_TCR(dart, sid)); + writel(tcr, dart->regs + DART_TCR(dart, sid)); } static void apple_dart_hw_disable_dma(struct apple_dart_stream_map *stream_map) @@ -569,7 +580,8 @@ apple_dart_setup_translation(struct apple_dart_domain *domain, for (; i < stream_map->dart->hw->ttbr_count; ++i) apple_dart_hw_clear_ttbr(stream_map, i); - apple_dart_hw_enable_translation(stream_map); + apple_dart_hw_enable_translation(stream_map, + pgtbl_cfg->apple_dart_cfg.n_levels); stream_map->dart->hw->invalidate_tlb(stream_map); } @@ -614,7 +626,7 @@ static int apple_dart_finalize_domain(struct apple_dart_domain *dart_domain, dart_domain->domain.pgsize_bitmap = pgtbl_cfg.pgsize_bitmap; dart_domain->domain.geometry.aperture_start = 0; dart_domain->domain.geometry.aperture_end = - (dma_addr_t)DMA_BIT_MASK(dart->ias); + (dma_addr_t)DMA_BIT_MASK(pgtbl_cfg.ias); dart_domain->domain.geometry.force_aperture = true; dart_domain->finalized = true; @@ -807,6 +819,8 @@ static int apple_dart_of_xlate(struct device *dev, if (cfg_dart) { if (cfg_dart->pgsize != dart->pgsize) return -EINVAL; + if (cfg_dart->ias != dart->ias) + return 
-EINVAL; } cfg->supports_bypass &= dart->supports_bypass; @@ -1077,6 +1091,9 @@ static irqreturn_t apple_dart_t8110_irq(int irq, void *dev) error, stream_idx, error_code, fault_name, addr); writel(error, dart->regs + DART_T8110_ERROR); + for (int i = 0; i < BITS_TO_U32(dart->num_streams); i++) + writel(U32_MAX, dart->regs + DART_T8110_ERROR_STREAMS + 4 * i); + return IRQ_HANDLED; } @@ -1137,6 +1154,7 @@ static int apple_dart_probe(struct platform_device *pdev) dart->ias = FIELD_GET(DART_T8110_PARAMS3_VA_WIDTH, dart_params[2]); dart->oas = FIELD_GET(DART_T8110_PARAMS3_PA_WIDTH, dart_params[2]); dart->num_streams = FIELD_GET(DART_T8110_PARAMS4_NUM_SIDS, dart_params[3]); + dart->four_level = dart->ias > 36; break; } @@ -1169,9 +1187,9 @@ static int apple_dart_probe(struct platform_device *pdev) dev_info( &pdev->dev, - "DART [pagesize %x, %d streams, bypass support: %d, bypass forced: %d] initialized\n", + "DART [pagesize %x, %d streams, bypass support: %d, bypass forced: %d, AS %d -> %d] initialized\n", dart->pgsize, dart->num_streams, dart->supports_bypass, - dart->pgsize > PAGE_SIZE); + dart->pgsize > PAGE_SIZE, dart->ias, dart->oas); return 0; err_sysfs_remove: @@ -1292,6 +1310,7 @@ static const struct apple_dart_hw apple_dart_hw_t8110 = { .tcr_enabled = DART_T8110_TCR_TRANSLATE_ENABLE, .tcr_disabled = 0, .tcr_bypass = DART_T8110_TCR_BYPASS_DAPF | DART_T8110_TCR_BYPASS_DART, + .tcr_4level = DART_T8110_TCR_FOUR_LEVEL, .ttbr = DART_T8110_TTBR, .ttbr_valid = DART_T8110_TTBR_VALID, diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index ea2ef53bd4fe..7944a3af4545 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -724,7 +724,12 @@ static int iommu_dma_init_domain(struct iommu_domain *domain, struct device *dev static int dma_info_to_prot(enum dma_data_direction dir, bool coherent, unsigned long attrs) { - int prot = coherent ? IOMMU_CACHE : 0; + int prot; + + if (attrs & DMA_ATTR_MMIO) + prot = IOMMU_MMIO; + else + prot = coherent ? 
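/* non-MMIO mappings are cacheable only when the device is DMA-coherent */ 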
IOMMU_CACHE : 0; if (attrs & DMA_ATTR_PRIVILEGED) prot |= IOMMU_PRIV; @@ -1190,11 +1195,9 @@ static inline size_t iova_unaligned(struct iova_domain *iovad, phys_addr_t phys, return iova_offset(iovad, phys | size); } -dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, enum dma_data_direction dir, - unsigned long attrs) +dma_addr_t iommu_dma_map_phys(struct device *dev, phys_addr_t phys, size_t size, + enum dma_data_direction dir, unsigned long attrs) { - phys_addr_t phys = page_to_phys(page) + offset; bool coherent = dev_is_dma_coherent(dev); int prot = dma_info_to_prot(dir, coherent, attrs); struct iommu_domain *domain = iommu_get_dma_domain(dev); @@ -1208,27 +1211,34 @@ dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page, */ if (dev_use_swiotlb(dev, size, dir) && iova_unaligned(iovad, phys, size)) { + if (attrs & DMA_ATTR_MMIO) + return DMA_MAPPING_ERROR; + phys = iommu_dma_map_swiotlb(dev, phys, size, dir, attrs); if (phys == (phys_addr_t)DMA_MAPPING_ERROR) return DMA_MAPPING_ERROR; } - if (!coherent && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) + if (!coherent && !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO))) arch_sync_dma_for_device(phys, size, dir); iova = __iommu_dma_map(dev, phys, size, prot, dma_mask); - if (iova == DMA_MAPPING_ERROR) + if (iova == DMA_MAPPING_ERROR && !(attrs & DMA_ATTR_MMIO)) swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs); return iova; } -void iommu_dma_unmap_page(struct device *dev, dma_addr_t dma_handle, +void iommu_dma_unmap_phys(struct device *dev, dma_addr_t dma_handle, size_t size, enum dma_data_direction dir, unsigned long attrs) { - struct iommu_domain *domain = iommu_get_dma_domain(dev); phys_addr_t phys; - phys = iommu_iova_to_phys(domain, dma_handle); + if (attrs & DMA_ATTR_MMIO) { + __iommu_dma_unmap(dev, dma_handle, size); + return; + } + + phys = iommu_iova_to_phys(iommu_get_dma_domain(dev), dma_handle); if (WARN_ON(!phys)) return; @@ -1341,7 +1351,7 @@ static void iommu_dma_unmap_sg_swiotlb(struct device *dev, struct scatterlist *s int i; for_each_sg(sg, s, nents, i) - iommu_dma_unmap_page(dev, sg_dma_address(s), + iommu_dma_unmap_phys(dev, sg_dma_address(s), sg_dma_len(s), dir, attrs); } @@ -1354,8 +1364,8 @@ static int iommu_dma_map_sg_swiotlb(struct device *dev, struct scatterlist *sg, sg_dma_mark_swiotlb(sg); for_each_sg(sg, s, nents, i) { - sg_dma_address(s) = iommu_dma_map_page(dev, sg_page(s), - s->offset, s->length, dir, attrs); + sg_dma_address(s) = iommu_dma_map_phys(dev, sg_phys(s), + s->length, dir, attrs); if (sg_dma_address(s) == DMA_MAPPING_ERROR) goto out_unmap; sg_dma_len(s) = s->length; @@ -1546,20 +1556,6 @@ void iommu_dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, __iommu_dma_unmap(dev, start, end - start); } -dma_addr_t iommu_dma_map_resource(struct device *dev, phys_addr_t phys, - size_t size, enum dma_data_direction dir, unsigned long attrs) -{ - return __iommu_dma_map(dev, phys, size, - dma_info_to_prot(dir, false, attrs) | IOMMU_MMIO, - dma_get_mask(dev)); -} - -void iommu_dma_unmap_resource(struct device *dev, dma_addr_t handle, - size_t size, enum dma_data_direction dir, unsigned long attrs) -{ - __iommu_dma_unmap(dev, handle, size); -} - static void __iommu_dma_free(struct device *dev, size_t size, void *cpu_addr) { size_t alloc_size = PAGE_ALIGN(size); @@ -1838,12 +1834,13 @@ static int __dma_iova_link(struct device *dev, dma_addr_t addr, unsigned long attrs) { bool coherent = dev_is_dma_coherent(dev); + int prot = 
dma_info_to_prot(dir, coherent, attrs); - if (!coherent && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) + if (!coherent && !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO))) arch_sync_dma_for_device(phys, size, dir); return iommu_map_nosync(iommu_get_dma_domain(dev), addr, phys, size, - dma_info_to_prot(dir, coherent, attrs), GFP_ATOMIC); + prot, GFP_ATOMIC); } static int iommu_dma_iova_bounce_and_link(struct device *dev, dma_addr_t addr, @@ -1949,9 +1946,13 @@ int dma_iova_link(struct device *dev, struct dma_iova_state *state, return -EIO; if (dev_use_swiotlb(dev, size, dir) && - iova_unaligned(iovad, phys, size)) + iova_unaligned(iovad, phys, size)) { + if (attrs & DMA_ATTR_MMIO) + return -EPERM; + return iommu_dma_iova_link_swiotlb(dev, state, phys, offset, size, dir, attrs); + } return __dma_iova_link(dev, state->addr + offset - iova_start_pad, phys - iova_start_pad, diff --git a/drivers/iommu/intel/debugfs.c b/drivers/iommu/intel/debugfs.c index affbf4a1558d..617fd81a80f0 100644 --- a/drivers/iommu/intel/debugfs.c +++ b/drivers/iommu/intel/debugfs.c @@ -62,8 +62,6 @@ static const struct iommu_regset iommu_regs_64[] = { IOMMU_REGSET_ENTRY(CAP), IOMMU_REGSET_ENTRY(ECAP), IOMMU_REGSET_ENTRY(RTADDR), - IOMMU_REGSET_ENTRY(CCMD), - IOMMU_REGSET_ENTRY(AFLOG), IOMMU_REGSET_ENTRY(PHMBASE), IOMMU_REGSET_ENTRY(PHMLIMIT), IOMMU_REGSET_ENTRY(IQH), @@ -435,8 +433,21 @@ static int domain_translation_struct_show(struct seq_file *m, } pgd &= VTD_PAGE_MASK; } else { /* legacy mode */ - pgd = context->lo & VTD_PAGE_MASK; - agaw = context->hi & 7; + u8 tt = (u8)(context->lo & GENMASK_ULL(3, 2)) >> 2; + + /* + * According to Translation Type(TT), + * get the page table pointer(SSPTPTR). + */ + switch (tt) { + case CONTEXT_TT_MULTI_LEVEL: + case CONTEXT_TT_DEV_IOTLB: + pgd = context->lo & VTD_PAGE_MASK; + agaw = context->hi & 7; + break; + default: + goto iommu_unlock; + } } seq_printf(m, "Device %04x:%02x:%02x.%x ", @@ -648,17 +659,11 @@ DEFINE_SHOW_ATTRIBUTE(ir_translation_struct); static void latency_show_one(struct seq_file *m, struct intel_iommu *iommu, struct dmar_drhd_unit *drhd) { - int ret; - seq_printf(m, "IOMMU: %s Register Base Address: %llx\n", iommu->name, drhd->reg_base_addr); - ret = dmar_latency_snapshot(iommu, debug_buf, DEBUG_BUFFER_SIZE); - if (ret < 0) - seq_puts(m, "Failed to get latency snapshot"); - else - seq_puts(m, debug_buf); - seq_puts(m, "\n"); + dmar_latency_snapshot(iommu, debug_buf, DEBUG_BUFFER_SIZE); + seq_printf(m, "%s\n", debug_buf); } static int latency_show(struct seq_file *m, void *v) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index dff2d895b8ab..e236c7ec221f 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -3817,7 +3817,7 @@ static struct iommu_device *intel_iommu_probe_device(struct device *dev) } if (info->ats_supported && ecap_prs(iommu->ecap) && - pci_pri_supported(pdev)) + ecap_pds(iommu->ecap) && pci_pri_supported(pdev)) info->pri_supported = 1; } } diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index d09b92871659..3056583d7f56 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -77,7 +77,6 @@ #define DMAR_FEDATA_REG 0x3c /* Fault event interrupt data register */ #define DMAR_FEADDR_REG 0x40 /* Fault event interrupt addr register */ #define DMAR_FEUADDR_REG 0x44 /* Upper address register */ -#define DMAR_AFLOG_REG 0x58 /* Advanced Fault control */ #define DMAR_PMEN_REG 0x64 /* Enable Protected Memory Region */ #define DMAR_PLMBASE_REG 0x68 /* PMRR Low addr */ 
#define DMAR_PLMLIMIT_REG 0x6c /* PMRR low limit */ @@ -173,8 +172,6 @@ #define cap_pgsel_inv(c) (((c) >> 39) & 1) #define cap_super_page_val(c) (((c) >> 34) & 0xf) -#define cap_super_offset(c) (((find_first_bit(&cap_super_page_val(c), 4)) \ - * OFFSET_STRIDE) + 21) #define cap_fault_reg_offset(c) ((((c) >> 24) & 0x3ff) * 16) #define cap_max_fault_reg_offset(c) \ @@ -462,7 +459,6 @@ enum { #define QI_PGRP_PASID(pasid) (((u64)(pasid)) << 32) /* Page group response descriptor QW1 */ -#define QI_PGRP_LPIG(x) (((u64)(x)) << 2) #define QI_PGRP_IDX(idx) (((u64)(idx)) << 3) @@ -541,7 +537,8 @@ enum { #define pasid_supported(iommu) (sm_supported(iommu) && \ ecap_pasid((iommu)->ecap)) #define ssads_supported(iommu) (sm_supported(iommu) && \ - ecap_slads((iommu)->ecap)) + ecap_slads((iommu)->ecap) && \ + ecap_smpwc(iommu->ecap)) #define nested_supported(iommu) (sm_supported(iommu) && \ ecap_nest((iommu)->ecap)) diff --git a/drivers/iommu/intel/perf.c b/drivers/iommu/intel/perf.c index adc4de6bbd88..dceeadc3ee7c 100644 --- a/drivers/iommu/intel/perf.c +++ b/drivers/iommu/intel/perf.c @@ -113,7 +113,7 @@ static char *latency_type_names[] = { " svm_prq" }; -int dmar_latency_snapshot(struct intel_iommu *iommu, char *str, size_t size) +void dmar_latency_snapshot(struct intel_iommu *iommu, char *str, size_t size) { struct latency_statistic *lstat = iommu->perf_statistic; unsigned long flags; @@ -122,7 +122,7 @@ int dmar_latency_snapshot(struct intel_iommu *iommu, char *str, size_t size) memset(str, 0, size); for (i = 0; i < COUNTS_NUM; i++) - bytes += snprintf(str + bytes, size - bytes, + bytes += scnprintf(str + bytes, size - bytes, "%s", latency_counter_names[i]); spin_lock_irqsave(&latency_lock, flags); @@ -130,7 +130,7 @@ int dmar_latency_snapshot(struct intel_iommu *iommu, char *str, size_t size) if (!dmar_latency_enabled(iommu, i)) continue; - bytes += snprintf(str + bytes, size - bytes, + bytes += scnprintf(str + bytes, size - bytes, "\n%s", latency_type_names[i]); for (j = 0; j < COUNTS_NUM; j++) { @@ -156,11 +156,9 @@ int dmar_latency_snapshot(struct intel_iommu *iommu, char *str, size_t size) break; } - bytes += snprintf(str + bytes, size - bytes, + bytes += scnprintf(str + bytes, size - bytes, "%12lld", val); } } spin_unlock_irqrestore(&latency_lock, flags); - - return bytes; } diff --git a/drivers/iommu/intel/perf.h b/drivers/iommu/intel/perf.h index df9a36942d64..1d4baad7e852 100644 --- a/drivers/iommu/intel/perf.h +++ b/drivers/iommu/intel/perf.h @@ -40,7 +40,7 @@ void dmar_latency_disable(struct intel_iommu *iommu, enum latency_type type); bool dmar_latency_enabled(struct intel_iommu *iommu, enum latency_type type); void dmar_latency_update(struct intel_iommu *iommu, enum latency_type type, u64 latency); -int dmar_latency_snapshot(struct intel_iommu *iommu, char *str, size_t size); +void dmar_latency_snapshot(struct intel_iommu *iommu, char *str, size_t size); #else static inline int dmar_latency_enable(struct intel_iommu *iommu, enum latency_type type) @@ -64,9 +64,8 @@ dmar_latency_update(struct intel_iommu *iommu, enum latency_type type, u64 laten { } -static inline int +static inline void dmar_latency_snapshot(struct intel_iommu *iommu, char *str, size_t size) { - return 0; } #endif /* CONFIG_DMAR_PERF */ diff --git a/drivers/iommu/intel/prq.c b/drivers/iommu/intel/prq.c index 52570e42a14c..ff63c228e6e1 100644 --- a/drivers/iommu/intel/prq.c +++ b/drivers/iommu/intel/prq.c @@ -151,8 +151,7 @@ static void handle_bad_prq_event(struct intel_iommu *iommu, QI_PGRP_PASID_P(req->pasid_present) 
| QI_PGRP_RESP_CODE(result) | QI_PGRP_RESP_TYPE; - desc.qw1 = QI_PGRP_IDX(req->prg_index) | - QI_PGRP_LPIG(req->lpig); + desc.qw1 = QI_PGRP_IDX(req->prg_index); qi_submit_sync(iommu, &desc, 1, 0); } @@ -379,19 +378,17 @@ void intel_iommu_page_response(struct device *dev, struct iopf_fault *evt, struct iommu_fault_page_request *prm; struct qi_desc desc; bool pasid_present; - bool last_page; u16 sid; prm = &evt->fault.prm; sid = PCI_DEVID(bus, devfn); pasid_present = prm->flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID; - last_page = prm->flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE; desc.qw0 = QI_PGRP_PASID(prm->pasid) | QI_PGRP_DID(sid) | QI_PGRP_PASID_P(pasid_present) | QI_PGRP_RESP_CODE(msg->code) | QI_PGRP_RESP_TYPE; - desc.qw1 = QI_PGRP_IDX(prm->grpid) | QI_PGRP_LPIG(last_page); + desc.qw1 = QI_PGRP_IDX(prm->grpid); desc.qw2 = 0; desc.qw3 = 0; diff --git a/drivers/iommu/io-pgtable-dart.c b/drivers/iommu/io-pgtable-dart.c index 679bda104797..54d287cc0dd1 100644 --- a/drivers/iommu/io-pgtable-dart.c +++ b/drivers/iommu/io-pgtable-dart.c @@ -27,8 +27,9 @@ #define DART1_MAX_ADDR_BITS 36 -#define DART_MAX_TABLES 4 -#define DART_LEVELS 2 +#define DART_MAX_TABLE_BITS 2 +#define DART_MAX_TABLES BIT(DART_MAX_TABLE_BITS) +#define DART_MAX_LEVELS 4 /* Includes TTBR level */ /* Struct accessors */ #define io_pgtable_to_data(x) \ @@ -68,6 +69,7 @@ struct dart_io_pgtable { struct io_pgtable iop; + int levels; int tbl_bits; int bits_per_level; @@ -156,44 +158,45 @@ static dart_iopte dart_install_table(dart_iopte *table, return old; } -static int dart_get_table(struct dart_io_pgtable *data, unsigned long iova) +static int dart_get_index(struct dart_io_pgtable *data, unsigned long iova, int level) { - return (iova >> (3 * data->bits_per_level + ilog2(sizeof(dart_iopte)))) & - ((1 << data->tbl_bits) - 1); + return (iova >> (level * data->bits_per_level + ilog2(sizeof(dart_iopte)))) & + ((1 << data->bits_per_level) - 1); } -static int dart_get_l1_index(struct dart_io_pgtable *data, unsigned long iova) -{ - - return (iova >> (2 * data->bits_per_level + ilog2(sizeof(dart_iopte)))) & - ((1 << data->bits_per_level) - 1); -} - -static int dart_get_l2_index(struct dart_io_pgtable *data, unsigned long iova) +static int dart_get_last_index(struct dart_io_pgtable *data, unsigned long iova) { return (iova >> (data->bits_per_level + ilog2(sizeof(dart_iopte)))) & ((1 << data->bits_per_level) - 1); } -static dart_iopte *dart_get_l2(struct dart_io_pgtable *data, unsigned long iova) +static dart_iopte *dart_get_last(struct dart_io_pgtable *data, unsigned long iova) { dart_iopte pte, *ptep; - int tbl = dart_get_table(data, iova); + int level = data->levels; + int tbl = dart_get_index(data, iova, level); + + if (tbl >= (1 << data->tbl_bits)) + return NULL; ptep = data->pgd[tbl]; if (!ptep) return NULL; - ptep += dart_get_l1_index(data, iova); - pte = READ_ONCE(*ptep); + while (--level > 1) { + ptep += dart_get_index(data, iova, level); + pte = READ_ONCE(*ptep); - /* Valid entry? */ - if (!pte) - return NULL; + /* Valid entry? 
*/ + if (!pte) + return NULL; - /* Deref to get level 2 table */ - return iopte_deref(pte, data); + /* Deref to get next level table */ + ptep = iopte_deref(pte, data); + } + + return ptep; } static dart_iopte dart_prot_to_pte(struct dart_io_pgtable *data, @@ -230,6 +233,7 @@ static int dart_map_pages(struct io_pgtable_ops *ops, unsigned long iova, int ret = 0, tbl, num_entries, max_entries, map_idx_start; dart_iopte pte, *cptep, *ptep; dart_iopte prot; + int level = data->levels; if (WARN_ON(pgsize != cfg->pgsize_bitmap)) return -EINVAL; @@ -240,31 +244,36 @@ static int dart_map_pages(struct io_pgtable_ops *ops, unsigned long iova, if (!(iommu_prot & (IOMMU_READ | IOMMU_WRITE))) return -EINVAL; - tbl = dart_get_table(data, iova); + tbl = dart_get_index(data, iova, level); + + if (tbl >= (1 << data->tbl_bits)) + return -ENOMEM; ptep = data->pgd[tbl]; - ptep += dart_get_l1_index(data, iova); - pte = READ_ONCE(*ptep); + while (--level > 1) { + ptep += dart_get_index(data, iova, level); + pte = READ_ONCE(*ptep); - /* no L2 table present */ - if (!pte) { - cptep = iommu_alloc_pages_sz(gfp, tblsz); - if (!cptep) - return -ENOMEM; + /* no table present */ + if (!pte) { + cptep = iommu_alloc_pages_sz(gfp, tblsz); + if (!cptep) + return -ENOMEM; - pte = dart_install_table(cptep, ptep, 0, data); - if (pte) - iommu_free_pages(cptep); + pte = dart_install_table(cptep, ptep, 0, data); + if (pte) + iommu_free_pages(cptep); - /* L2 table is present (now) */ - pte = READ_ONCE(*ptep); - } + /* L2 table is present (now) */ + pte = READ_ONCE(*ptep); + } - ptep = iopte_deref(pte, data); + ptep = iopte_deref(pte, data); + } /* install a leaf entries into L2 table */ prot = dart_prot_to_pte(data, iommu_prot); - map_idx_start = dart_get_l2_index(data, iova); + map_idx_start = dart_get_last_index(data, iova); max_entries = DART_PTES_PER_TABLE(data) - map_idx_start; num_entries = min_t(int, pgcount, max_entries); ptep += map_idx_start; @@ -293,13 +302,13 @@ static size_t dart_unmap_pages(struct io_pgtable_ops *ops, unsigned long iova, if (WARN_ON(pgsize != cfg->pgsize_bitmap || !pgcount)) return 0; - ptep = dart_get_l2(data, iova); + ptep = dart_get_last(data, iova); /* Valid L2 IOPTE pointer? */ if (WARN_ON(!ptep)) return 0; - unmap_idx_start = dart_get_l2_index(data, iova); + unmap_idx_start = dart_get_last_index(data, iova); ptep += unmap_idx_start; max_entries = DART_PTES_PER_TABLE(data) - unmap_idx_start; @@ -330,13 +339,13 @@ static phys_addr_t dart_iova_to_phys(struct io_pgtable_ops *ops, struct dart_io_pgtable *data = io_pgtable_ops_to_data(ops); dart_iopte pte, *ptep; - ptep = dart_get_l2(data, iova); + ptep = dart_get_last(data, iova); /* Valid L2 IOPTE pointer? */ if (!ptep) return 0; - ptep += dart_get_l2_index(data, iova); + ptep += dart_get_last_index(data, iova); pte = READ_ONCE(*ptep); /* Found translation */ @@ -353,21 +362,37 @@ static struct dart_io_pgtable * dart_alloc_pgtable(struct io_pgtable_cfg *cfg) { struct dart_io_pgtable *data; - int tbl_bits, bits_per_level, va_bits, pg_shift; + int levels, max_tbl_bits, tbl_bits, bits_per_level, va_bits, pg_shift; + + /* + * Old 4K page DARTs can use up to 4 top-level tables. + * Newer ones only ever use a maximum of 1. 
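+ * + * Illustrative numbers (with a hypothetical ias value, not taken from any real DART instance): + * with 4 KiB pages, pg_shift = 12 and bits_per_level = 12 - ilog2(sizeof(dart_iopte)) = 9, + * so an ias of 32 gives va_bits = 20, levels = max(2, (20 - 2 + 8) / 9) = 2 and + * tbl_bits = 20 - 2 * 9 = 2, i.e. all four top-level tables in use and + * data->levels = 3 once the TTBR level is counted.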
+ */ + if (cfg->pgsize_bitmap == SZ_4K) + max_tbl_bits = DART_MAX_TABLE_BITS; + else + max_tbl_bits = 0; pg_shift = __ffs(cfg->pgsize_bitmap); bits_per_level = pg_shift - ilog2(sizeof(dart_iopte)); va_bits = cfg->ias - pg_shift; - tbl_bits = max_t(int, 0, va_bits - (bits_per_level * DART_LEVELS)); - if ((1 << tbl_bits) > DART_MAX_TABLES) + levels = max_t(int, 2, (va_bits - max_tbl_bits + bits_per_level - 1) / bits_per_level); + + if (levels > (DART_MAX_LEVELS - 1)) + return NULL; + + tbl_bits = max_t(int, 0, va_bits - (bits_per_level * levels)); + + if (tbl_bits > max_tbl_bits) return NULL; data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) return NULL; + data->levels = levels + 1; /* Table level counts as one level */ data->tbl_bits = tbl_bits; data->bits_per_level = bits_per_level; @@ -403,6 +428,7 @@ apple_dart_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie) return NULL; cfg->apple_dart_cfg.n_ttbrs = 1 << data->tbl_bits; + cfg->apple_dart_cfg.n_levels = data->levels; for (i = 0; i < cfg->apple_dart_cfg.n_ttbrs; ++i) { data->pgd[i] = @@ -422,24 +448,31 @@ out_free_data: return NULL; } -static void apple_dart_free_pgtable(struct io_pgtable *iop) +static void apple_dart_free_pgtables(struct dart_io_pgtable *data, dart_iopte *ptep, int level) { - struct dart_io_pgtable *data = io_pgtable_to_data(iop); - dart_iopte *ptep, *end; - int i; + dart_iopte *end; + dart_iopte *start = ptep; - for (i = 0; i < (1 << data->tbl_bits) && data->pgd[i]; ++i) { - ptep = data->pgd[i]; + if (level > 1) { end = (void *)ptep + DART_GRANULE(data); while (ptep != end) { dart_iopte pte = *ptep++; if (pte) - iommu_free_pages(iopte_deref(pte, data)); + apple_dart_free_pgtables(data, iopte_deref(pte, data), level - 1); } - iommu_free_pages(data->pgd[i]); } + iommu_free_pages(start); +} + +static void apple_dart_free_pgtable(struct io_pgtable *iop) +{ + struct dart_io_pgtable *data = io_pgtable_to_data(iop); + int i; + + for (i = 0; i < (1 << data->tbl_bits) && data->pgd[i]; ++i) + apple_dart_free_pgtables(data, data->pgd[i], data->levels - 1); kfree(data); } diff --git a/drivers/iommu/iommu-priv.h b/drivers/iommu/iommu-priv.h index e236b932e766..c95394cd03a7 100644 --- a/drivers/iommu/iommu-priv.h +++ b/drivers/iommu/iommu-priv.h @@ -37,6 +37,8 @@ void iommu_device_unregister_bus(struct iommu_device *iommu, const struct bus_type *bus, struct notifier_block *nb); +int iommu_mock_device_add(struct device *dev, struct iommu_device *iommu); + struct iommu_attach_handle *iommu_attach_handle_get(struct iommu_group *group, ioasid_t pasid, unsigned int type); diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 060ebe330ee1..59244c744eab 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -304,6 +304,7 @@ void iommu_device_unregister_bus(struct iommu_device *iommu, struct notifier_block *nb) { bus_unregister_notifier(bus, nb); + fwnode_remove_software_node(iommu->fwnode); iommu_device_unregister(iommu); } EXPORT_SYMBOL_GPL(iommu_device_unregister_bus); @@ -326,6 +327,12 @@ int iommu_device_register_bus(struct iommu_device *iommu, if (err) return err; + iommu->fwnode = fwnode_create_software_node(NULL, NULL); + if (IS_ERR(iommu->fwnode)) { + bus_unregister_notifier(bus, nb); + return PTR_ERR(iommu->fwnode); + } + spin_lock(&iommu_device_lock); list_add_tail(&iommu->list, &iommu_device_list); spin_unlock(&iommu_device_lock); @@ -335,9 +342,28 @@ int iommu_device_register_bus(struct iommu_device *iommu, iommu_device_unregister_bus(iommu, bus, nb); return err; } + WRITE_ONCE(iommu->ready, 
true); return 0; } EXPORT_SYMBOL_GPL(iommu_device_register_bus); + +int iommu_mock_device_add(struct device *dev, struct iommu_device *iommu) +{ + int rc; + + mutex_lock(&iommu_probe_device_lock); + rc = iommu_fwspec_init(dev, iommu->fwnode); + mutex_unlock(&iommu_probe_device_lock); + + if (rc) + return rc; + + rc = device_add(dev); + if (rc) + iommu_fwspec_free(dev); + return rc; +} +EXPORT_SYMBOL_GPL(iommu_mock_device_add); #endif static struct dev_iommu *dev_iommu_get(struct device *dev) diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index 61686603c769..de178827a078 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -1126,7 +1126,7 @@ static struct mock_dev *mock_dev_create(unsigned long dev_flags) goto err_put; } - rc = device_add(&mdev->dev); + rc = iommu_mock_device_add(&mdev->dev, &mock_iommu.iommu_dev); if (rc) goto err_put; return mdev; diff --git a/drivers/iommu/omap-iommu.c b/drivers/iommu/omap-iommu.c index 6fb93927bdb9..5c6f5943f44b 100644 --- a/drivers/iommu/omap-iommu.c +++ b/drivers/iommu/omap-iommu.c @@ -1303,8 +1303,8 @@ static int omap_iommu_map(struct iommu_domain *domain, unsigned long da, struct omap_iommu_device *iommu; struct omap_iommu *oiommu; struct iotlb_entry e; + int ret = -EINVAL; int omap_pgsz; - u32 ret = -EINVAL; int i; omap_pgsz = bytes_to_iopgsz(bytes); diff --git a/drivers/iommu/riscv/iommu-platform.c b/drivers/iommu/riscv/iommu-platform.c index 725e919b97ef..83a28c83f991 100644 --- a/drivers/iommu/riscv/iommu-platform.c +++ b/drivers/iommu/riscv/iommu-platform.c @@ -10,6 +10,8 @@ * Tomasz Jeznach <tjeznach@rivosinc.com> */ +#include <linux/acpi.h> +#include <linux/irqchip/riscv-imsic.h> #include <linux/kernel.h> #include <linux/msi.h> #include <linux/of_irq.h> @@ -46,6 +48,7 @@ static int riscv_iommu_platform_probe(struct platform_device *pdev) enum riscv_iommu_igs_settings igs; struct device *dev = &pdev->dev; struct riscv_iommu_device *iommu = NULL; + struct irq_domain *msi_domain; struct resource *res = NULL; int vec, ret; @@ -76,8 +79,13 @@ static int riscv_iommu_platform_probe(struct platform_device *pdev) switch (igs) { case RISCV_IOMMU_CAPABILITIES_IGS_BOTH: case RISCV_IOMMU_CAPABILITIES_IGS_MSI: - if (is_of_node(dev->fwnode)) + if (is_of_node(dev_fwnode(dev))) { of_msi_configure(dev, to_of_node(dev->fwnode)); + } else { + msi_domain = irq_find_matching_fwnode(imsic_acpi_get_fwnode(dev), + DOMAIN_BUS_PLATFORM_MSI); + dev_set_msi_domain(dev, msi_domain); + } if (!dev_get_msi_domain(dev)) { dev_warn(dev, "failed to find an MSI domain\n"); @@ -150,6 +158,12 @@ static const struct of_device_id riscv_iommu_of_match[] = { {}, }; +static const struct acpi_device_id riscv_iommu_acpi_match[] = { + { "RSCV0004", 0 }, + {} +}; +MODULE_DEVICE_TABLE(acpi, riscv_iommu_acpi_match); + static struct platform_driver riscv_iommu_platform_driver = { .probe = riscv_iommu_platform_probe, .remove = riscv_iommu_platform_remove, @@ -158,6 +172,7 @@ static struct platform_driver riscv_iommu_platform_driver = { .name = "riscv,iommu", .of_match_table = riscv_iommu_of_match, .suppress_bind_attrs = true, + .acpi_match_table = riscv_iommu_acpi_match, }, }; diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c index 0eae2f4bdc5e..ebb22979075d 100644 --- a/drivers/iommu/riscv/iommu.c +++ b/drivers/iommu/riscv/iommu.c @@ -12,6 +12,8 @@ #define pr_fmt(fmt) "riscv-iommu: " fmt +#include <linux/acpi.h> +#include <linux/acpi_rimt.h> #include <linux/compiler.h> #include <linux/crash_dump.h> 
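+/* + * RIMT, the RISC-V IO Mapping Table, is the ACPI table that maps devices to + * their IOMMU (the RISC-V counterpart of ARM's IORT); rimt_iommu_register() + * below publishes this IOMMU instance there so ACPI-booted systems can + * resolve the mapping. + */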
#include <linux/init.h> @@ -1650,6 +1652,14 @@ int riscv_iommu_init(struct riscv_iommu_device *iommu) goto err_iodir_off; } + if (!acpi_disabled) { + rc = rimt_iommu_register(iommu->dev); + if (rc) { + dev_err_probe(iommu->dev, rc, "cannot register iommu with RIMT\n"); + goto err_remove_sysfs; + } + } + rc = iommu_device_register(&iommu->iommu, &riscv_iommu_ops, iommu->dev); if (rc) { dev_err_probe(iommu->dev, rc, "cannot register iommu interface\n"); diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 07c19b2182ca..104aa5355090 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -688,4 +688,6 @@ config DM_AUDIT source "drivers/md/dm-vdo/Kconfig" +source "drivers/md/dm-pcache/Kconfig" + endif # MD diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 5a51b3408b70..c338cc6fbe2e 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -73,6 +73,7 @@ obj-$(CONFIG_DM_RAID) += dm-raid.o obj-$(CONFIG_DM_THIN_PROVISIONING) += dm-thin-pool.o obj-$(CONFIG_DM_VERITY) += dm-verity.o obj-$(CONFIG_DM_VDO) += dm-vdo/ +obj-$(CONFIG_DM_PCACHE) += dm-pcache/ obj-$(CONFIG_DM_CACHE) += dm-cache.o obj-$(CONFIG_DM_CACHE_SMQ) += dm-cache-smq.o obj-$(CONFIG_DM_EBS) += dm-ebs.o diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 8f3a23f4b168..e6d28be11c5c 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -1337,7 +1337,7 @@ static void use_bio(struct dm_buffer *b, enum req_op op, sector_t sector, char *ptr; unsigned int len; - bio = bio_kmalloc(1, GFP_NOWAIT | __GFP_NORETRY | __GFP_NOWARN); + bio = bio_kmalloc(1, GFP_NOWAIT); if (!bio) { use_dmio(b, op, sector, n_sectors, offset, ioprio); return; @@ -1601,18 +1601,18 @@ static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client * dm-bufio is resistant to allocation failures (it just keeps * one buffer reserved in cases all the allocations fail). * So set flags to not try too hard: - * GFP_NOWAIT: don't wait; if we need to sleep we'll release our - * mutex and wait ourselves. + * GFP_NOWAIT: don't wait and don't print a warning in case of + * failure; if we need to sleep we'll release our mutex + * and wait ourselves. * __GFP_NORETRY: don't retry and rather return failure * __GFP_NOMEMALLOC: don't use emergency reserves - * __GFP_NOWARN: don't print a warning in case of failure * * For debugging, if we set the cache size to 1, no new buffers will * be allocated. 
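+ * + * Note: in current kernels GFP_NOWAIT itself already includes __GFP_NOWARN + * (it is defined as __GFP_KSWAPD_RECLAIM | __GFP_NOWARN), so dropping the + * explicit __GFP_NOWARN from these call sites loses nothing.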
*/ while (1) { if (dm_bufio_cache_size_latch != 1) { - b = alloc_buffer(c, GFP_NOWAIT | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); + b = alloc_buffer(c, GFP_NOWAIT | __GFP_NORETRY | __GFP_NOMEMALLOC); if (b) return b; } diff --git a/drivers/md/dm-cache-policy-smq.c b/drivers/md/dm-cache-policy-smq.c index 2ed894155cab..7e1e8cc0e33a 100644 --- a/drivers/md/dm-cache-policy-smq.c +++ b/drivers/md/dm-cache-policy-smq.c @@ -590,7 +590,7 @@ static int h_init(struct smq_hash_table *ht, struct entry_space *es, unsigned in nr_buckets = roundup_pow_of_two(max(nr_entries / 4u, 16u)); ht->hash_bits = __ffs(nr_buckets); - ht->buckets = vmalloc(array_size(nr_buckets, sizeof(*ht->buckets))); + ht->buckets = vmalloc_array(nr_buckets, sizeof(*ht->buckets)); if (!ht->buckets) return -ENOMEM; diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index c889332e533b..a3c9f74fe2dc 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -162,6 +162,7 @@ struct mapped_device { #define DMF_SUSPENDED_INTERNALLY 7 #define DMF_POST_SUSPENDING 8 #define DMF_EMULATE_ZONE_APPEND 9 +#define DMF_QUEUE_STOPPED 10 static inline sector_t dm_get_size(struct mapped_device *md) { @@ -291,6 +292,7 @@ struct dm_io { struct dm_io *next; struct dm_stats_aux stats_aux; blk_status_t status; + bool requeue_flush_with_data; atomic_t io_count; struct mapped_device *md; diff --git a/drivers/md/dm-ima.c b/drivers/md/dm-ima.c index 8b50c908c6f4..efb3cd4f9cd4 100644 --- a/drivers/md/dm-ima.c +++ b/drivers/md/dm-ima.c @@ -45,7 +45,7 @@ static void fix_separator_chars(char **buf) /* * Internal function to allocate memory for IMA measurements. */ -static void *dm_ima_alloc(size_t len, gfp_t flags, bool noio) +static void *dm_ima_alloc(size_t len, bool noio) { unsigned int noio_flag; void *ptr; @@ -53,7 +53,7 @@ static void *dm_ima_alloc(size_t len, gfp_t flags, bool noio) if (noio) noio_flag = memalloc_noio_save(); - ptr = kzalloc(len, flags); + ptr = kzalloc(len, GFP_KERNEL); if (noio) memalloc_noio_restore(noio_flag); @@ -68,13 +68,13 @@ static int dm_ima_alloc_and_copy_name_uuid(struct mapped_device *md, char **dev_ char **dev_uuid, bool noio) { int r; - *dev_name = dm_ima_alloc(DM_NAME_LEN*2, GFP_KERNEL, noio); + *dev_name = dm_ima_alloc(DM_NAME_LEN*2, noio); if (!(*dev_name)) { r = -ENOMEM; goto error; } - *dev_uuid = dm_ima_alloc(DM_UUID_LEN*2, GFP_KERNEL, noio); + *dev_uuid = dm_ima_alloc(DM_UUID_LEN*2, noio); if (!(*dev_uuid)) { r = -ENOMEM; goto error; @@ -109,7 +109,7 @@ static int dm_ima_alloc_and_copy_device_data(struct mapped_device *md, char **de if (r) return r; - *device_data = dm_ima_alloc(DM_IMA_DEVICE_BUF_LEN, GFP_KERNEL, noio); + *device_data = dm_ima_alloc(DM_IMA_DEVICE_BUF_LEN, noio); if (!(*device_data)) { r = -ENOMEM; goto error; @@ -153,14 +153,12 @@ static int dm_ima_alloc_and_copy_capacity_str(struct mapped_device *md, char **c capacity = get_capacity(md->disk); - *capacity_str = dm_ima_alloc(DM_IMA_DEVICE_CAPACITY_BUF_LEN, GFP_KERNEL, noio); + *capacity_str = dm_ima_alloc(DM_IMA_DEVICE_CAPACITY_BUF_LEN, noio); if (!(*capacity_str)) return -ENOMEM; - scnprintf(*capacity_str, DM_IMA_DEVICE_BUF_LEN, "current_device_capacity=%llu;", - capacity); - - return 0; + return scnprintf(*capacity_str, DM_IMA_DEVICE_BUF_LEN, "current_device_capacity=%llu;", + capacity); } /* @@ -195,15 +193,15 @@ void dm_ima_measure_on_table_load(struct dm_table *table, unsigned int status_fl const size_t hash_alg_prefix_len = strlen(DM_IMA_TABLE_HASH_ALG) + 1; char table_load_event_name[] = "dm_table_load"; - ima_buf = 
dm_ima_alloc(DM_IMA_MEASUREMENT_BUF_LEN, GFP_KERNEL, noio); + ima_buf = dm_ima_alloc(DM_IMA_MEASUREMENT_BUF_LEN, noio); if (!ima_buf) return; - target_metadata_buf = dm_ima_alloc(DM_IMA_TARGET_METADATA_BUF_LEN, GFP_KERNEL, noio); + target_metadata_buf = dm_ima_alloc(DM_IMA_TARGET_METADATA_BUF_LEN, noio); if (!target_metadata_buf) goto error; - target_data_buf = dm_ima_alloc(DM_IMA_TARGET_DATA_BUF_LEN, GFP_KERNEL, noio); + target_data_buf = dm_ima_alloc(DM_IMA_TARGET_DATA_BUF_LEN, noio); if (!target_data_buf) goto error; @@ -218,7 +216,7 @@ void dm_ima_measure_on_table_load(struct dm_table *table, unsigned int status_fl shash->tfm = tfm; digest_size = crypto_shash_digestsize(tfm); - digest = dm_ima_alloc(digest_size, GFP_KERNEL, noio); + digest = dm_ima_alloc(digest_size, noio); if (!digest) goto error; @@ -327,7 +325,7 @@ void dm_ima_measure_on_table_load(struct dm_table *table, unsigned int status_fl if (r < 0) goto error; - digest_buf = dm_ima_alloc((digest_size*2) + hash_alg_prefix_len + 1, GFP_KERNEL, noio); + digest_buf = dm_ima_alloc((digest_size*2) + hash_alg_prefix_len + 1, noio); if (!digest_buf) goto error; @@ -371,18 +369,18 @@ void dm_ima_measure_on_device_resume(struct mapped_device *md, bool swap) { char *device_table_data, *dev_name = NULL, *dev_uuid = NULL, *capacity_str = NULL; char active[] = "active_table_hash="; - unsigned int active_len = strlen(active), capacity_len = 0; + unsigned int active_len = strlen(active); unsigned int l = 0; bool noio = true; bool nodata = true; - int r; + int capacity_len; - device_table_data = dm_ima_alloc(DM_IMA_DEVICE_BUF_LEN, GFP_KERNEL, noio); + device_table_data = dm_ima_alloc(DM_IMA_DEVICE_BUF_LEN, noio); if (!device_table_data) return; - r = dm_ima_alloc_and_copy_capacity_str(md, &capacity_str, noio); - if (r) + capacity_len = dm_ima_alloc_and_copy_capacity_str(md, &capacity_str, noio); + if (capacity_len < 0) goto error; memcpy(device_table_data + l, DM_IMA_VERSION_STR, md->ima.dm_version_str_len); @@ -445,8 +443,7 @@ void dm_ima_measure_on_device_resume(struct mapped_device *md, bool swap) } if (nodata) { - r = dm_ima_alloc_and_copy_name_uuid(md, &dev_name, &dev_uuid, noio); - if (r) + if (dm_ima_alloc_and_copy_name_uuid(md, &dev_name, &dev_uuid, noio)) goto error; l = scnprintf(device_table_data, DM_IMA_DEVICE_BUF_LEN, @@ -454,7 +451,6 @@ void dm_ima_measure_on_device_resume(struct mapped_device *md, bool swap) DM_IMA_VERSION_STR, dev_name, dev_uuid); } - capacity_len = strlen(capacity_str); memcpy(device_table_data + l, capacity_str, capacity_len); l += capacity_len; @@ -483,18 +479,17 @@ void dm_ima_measure_on_device_remove(struct mapped_device *md, bool remove_all) unsigned int device_active_len = strlen(device_active_str); unsigned int device_inactive_len = strlen(device_inactive_str); unsigned int remove_all_len = strlen(remove_all_str); - unsigned int capacity_len = 0; unsigned int l = 0; bool noio = true; bool nodata = true; - int r; + int capacity_len; - device_table_data = dm_ima_alloc(DM_IMA_DEVICE_BUF_LEN*2, GFP_KERNEL, noio); + device_table_data = dm_ima_alloc(DM_IMA_DEVICE_BUF_LEN*2, noio); if (!device_table_data) goto exit; - r = dm_ima_alloc_and_copy_capacity_str(md, &capacity_str, noio); - if (r) { + capacity_len = dm_ima_alloc_and_copy_capacity_str(md, &capacity_str, noio); + if (capacity_len < 0) { kfree(device_table_data); goto exit; } @@ -570,7 +565,6 @@ void dm_ima_measure_on_device_remove(struct mapped_device *md, bool remove_all) memcpy(device_table_data + l, remove_all ? 
"y;" : "n;", 2); l += 2; - capacity_len = strlen(capacity_str); memcpy(device_table_data + l, capacity_str, capacity_len); l += capacity_len; @@ -602,20 +596,20 @@ exit: */ void dm_ima_measure_on_table_clear(struct mapped_device *md, bool new_map) { - unsigned int l = 0, capacity_len = 0; + unsigned int l = 0; char *device_table_data = NULL, *dev_name = NULL, *dev_uuid = NULL, *capacity_str = NULL; char inactive_str[] = "inactive_table_hash="; unsigned int inactive_len = strlen(inactive_str); bool noio = true; bool nodata = true; - int r; + int capacity_len; - device_table_data = dm_ima_alloc(DM_IMA_DEVICE_BUF_LEN, GFP_KERNEL, noio); + device_table_data = dm_ima_alloc(DM_IMA_DEVICE_BUF_LEN, noio); if (!device_table_data) return; - r = dm_ima_alloc_and_copy_capacity_str(md, &capacity_str, noio); - if (r) + capacity_len = dm_ima_alloc_and_copy_capacity_str(md, &capacity_str, noio); + if (capacity_len < 0) goto error1; memcpy(device_table_data + l, DM_IMA_VERSION_STR, md->ima.dm_version_str_len); @@ -650,7 +644,6 @@ void dm_ima_measure_on_table_clear(struct mapped_device *md, bool new_map) DM_IMA_VERSION_STR, dev_name, dev_uuid); } - capacity_len = strlen(capacity_str); memcpy(device_table_data + l, capacity_str, capacity_len); l += capacity_len; @@ -703,7 +696,7 @@ void dm_ima_measure_on_device_rename(struct mapped_device *md) char *old_device_data = NULL, *new_device_data = NULL, *combined_device_data = NULL; char *new_dev_name = NULL, *new_dev_uuid = NULL, *capacity_str = NULL; bool noio = true; - int r, len; + int len; if (dm_ima_alloc_and_copy_device_data(md, &new_device_data, md->ima.active_table.num_targets, noio)) @@ -712,12 +705,11 @@ void dm_ima_measure_on_device_rename(struct mapped_device *md) if (dm_ima_alloc_and_copy_name_uuid(md, &new_dev_name, &new_dev_uuid, noio)) goto error; - combined_device_data = dm_ima_alloc(DM_IMA_DEVICE_BUF_LEN * 2, GFP_KERNEL, noio); + combined_device_data = dm_ima_alloc(DM_IMA_DEVICE_BUF_LEN * 2, noio); if (!combined_device_data) goto error; - r = dm_ima_alloc_and_copy_capacity_str(md, &capacity_str, noio); - if (r) + if (dm_ima_alloc_and_copy_capacity_str(md, &capacity_str, noio) < 0) goto error; old_device_data = md->ima.active_table.device_metadata; diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index ab96b692e5a3..170bf67a2edd 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -219,10 +219,13 @@ struct dm_integrity_c { __u8 log2_blocks_per_bitmap_bit; unsigned char mode; + bool internal_hash; int failed; - struct crypto_shash *internal_hash; + struct crypto_shash *internal_shash; + struct crypto_ahash *internal_ahash; + unsigned int internal_hash_digestsize; struct dm_target *ti; @@ -277,6 +280,9 @@ struct dm_integrity_c { bool fix_hmac; bool legacy_recalculate; + mempool_t ahash_req_pool; + struct ahash_request *journal_ahash_req; + struct alg_spec internal_hash_alg; struct alg_spec journal_crypt_alg; struct alg_spec journal_mac_alg; @@ -326,6 +332,8 @@ struct dm_integrity_io { unsigned payload_len; bool integrity_payload_from_mempool; bool integrity_range_locked; + + struct ahash_request *ahash_req; }; struct journal_completion { @@ -352,6 +360,7 @@ struct bitmap_block_status { static struct kmem_cache *journal_io_cache; #define JOURNAL_IO_MEMPOOL 32 +#define AHASH_MEMPOOL 32 #ifdef DEBUG_PRINT #define DEBUG_print(x, ...) 
printk(KERN_DEBUG x, ##__VA_ARGS__) @@ -1634,15 +1643,15 @@ static void integrity_end_io(struct bio *bio) dec_in_flight(dio); } -static void integrity_sector_checksum(struct dm_integrity_c *ic, sector_t sector, - const char *data, char *result) +static void integrity_sector_checksum_shash(struct dm_integrity_c *ic, sector_t sector, + const char *data, unsigned offset, char *result) { __le64 sector_le = cpu_to_le64(sector); - SHASH_DESC_ON_STACK(req, ic->internal_hash); + SHASH_DESC_ON_STACK(req, ic->internal_shash); int r; unsigned int digest_size; - req->tfm = ic->internal_hash; + req->tfm = ic->internal_shash; r = crypto_shash_init(req); if (unlikely(r < 0)) { @@ -1664,7 +1673,7 @@ static void integrity_sector_checksum(struct dm_integrity_c *ic, sector_t sector goto failed; } - r = crypto_shash_update(req, data, ic->sectors_per_block << SECTOR_SHIFT); + r = crypto_shash_update(req, data + offset, ic->sectors_per_block << SECTOR_SHIFT); if (unlikely(r < 0)) { dm_integrity_io_error(ic, "crypto_shash_update", r); goto failed; } @@ -1676,7 +1685,7 @@ static void integrity_sector_checksum(struct dm_integrity_c *ic, sector_t sector goto failed; } - digest_size = crypto_shash_digestsize(ic->internal_hash); + digest_size = ic->internal_hash_digestsize; if (unlikely(digest_size < ic->tag_size)) memset(result + digest_size, 0, ic->tag_size - digest_size); @@ -1687,6 +1696,104 @@ failed: get_random_bytes(result, ic->tag_size); } +static void integrity_sector_checksum_ahash(struct dm_integrity_c *ic, struct ahash_request **ahash_req, + sector_t sector, struct page *page, unsigned offset, char *result) +{ + __le64 sector_le = cpu_to_le64(sector); + struct ahash_request *req; + DECLARE_CRYPTO_WAIT(wait); + struct scatterlist sg[3], *s = sg; + int r; + unsigned int digest_size; + unsigned int nbytes = 0; + + might_sleep(); + + req = *ahash_req; + if (unlikely(!req)) { + req = mempool_alloc(&ic->ahash_req_pool, GFP_NOIO); + *ahash_req = req; + } + + ahash_request_set_tfm(req, ic->internal_ahash); + ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, crypto_req_done, &wait); + + if (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_HMAC)) { + sg_init_table(sg, 3); + sg_set_buf(s, (const __u8 *)&ic->sb->salt, SALT_SIZE); + nbytes += SALT_SIZE; + s++; + } else { + sg_init_table(sg, 2); + } + + if (likely(!is_vmalloc_addr(&sector_le))) { + sg_set_buf(s, &sector_le, sizeof(sector_le)); + } else { + struct page *sec_page = vmalloc_to_page(&sector_le); + unsigned int sec_off = offset_in_page(&sector_le); + sg_set_page(s, sec_page, sizeof(sector_le), sec_off); + } + nbytes += sizeof(sector_le); + s++; + + sg_set_page(s, page, ic->sectors_per_block << SECTOR_SHIFT, offset); + nbytes += ic->sectors_per_block << SECTOR_SHIFT; + + ahash_request_set_crypt(req, sg, result, nbytes); + + r = crypto_wait_req(crypto_ahash_digest(req), &wait); + if (unlikely(r)) { + dm_integrity_io_error(ic, "crypto_ahash_digest", r); + goto failed; + } + + digest_size = ic->internal_hash_digestsize; + if (unlikely(digest_size < ic->tag_size)) + memset(result + digest_size, 0, ic->tag_size - digest_size); + + return; + +failed: + /* this shouldn't happen anyway, the hash functions have no reason to fail */ + get_random_bytes(result, ic->tag_size); +} + +static void integrity_sector_checksum(struct dm_integrity_c *ic, struct ahash_request **ahash_req, + sector_t sector, const char *data, unsigned offset, char *result) +{ + if (likely(ic->internal_shash != NULL)) + integrity_sector_checksum_shash(ic, sector, data, offset, result); + else + 
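/* the ahash path is handed the struct page pointer produced by integrity_kmap() or integrity_identity() through the 'data' argument, hence the cast back to struct page * below */ + 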
integrity_sector_checksum_ahash(ic, ahash_req, sector, (struct page *)data, offset, result); +} + +static void *integrity_kmap(struct dm_integrity_c *ic, struct page *p) +{ + if (likely(ic->internal_shash != NULL)) + return kmap_local_page(p); + else + return p; +} + +static void integrity_kunmap(struct dm_integrity_c *ic, const void *ptr) +{ + if (likely(ic->internal_shash != NULL)) + kunmap_local(ptr); +} + +static void *integrity_identity(struct dm_integrity_c *ic, void *data) +{ +#ifdef CONFIG_DEBUG_SG + BUG_ON(offset_in_page(data)); + BUG_ON(!virt_addr_valid(data)); +#endif + if (likely(ic->internal_shash != NULL)) + return data; + else + return virt_to_page(data); +} + static noinline void integrity_recheck(struct dm_integrity_io *dio, char *checksum) { struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io)); @@ -1711,6 +1818,7 @@ static noinline void integrity_recheck(struct dm_integrity_io *dio, char *checks sector_t alignment; char *mem; char *buffer = page_to_virt(page); + unsigned int buffer_offset; int r; struct dm_io_request io_req; struct dm_io_region io_loc; @@ -1728,7 +1836,7 @@ static noinline void integrity_recheck(struct dm_integrity_io *dio, char *checks alignment &= -alignment; io_loc.sector = round_down(io_loc.sector, alignment); io_loc.count += sector - io_loc.sector; - buffer += (sector - io_loc.sector) << SECTOR_SHIFT; + buffer_offset = (sector - io_loc.sector) << SECTOR_SHIFT; io_loc.count = round_up(io_loc.count, alignment); r = dm_io(&io_req, 1, &io_loc, NULL, IOPRIO_DEFAULT); @@ -1737,7 +1845,7 @@ static noinline void integrity_recheck(struct dm_integrity_io *dio, char *checks goto free_ret; } - integrity_sector_checksum(ic, logical_sector, buffer, checksum); + integrity_sector_checksum(ic, &dio->ahash_req, logical_sector, integrity_identity(ic, buffer), buffer_offset, checksum); r = dm_integrity_rw_tag(ic, checksum, &dio->metadata_block, &dio->metadata_offset, ic->tag_size, TAG_CMP); if (r) { @@ -1754,7 +1862,7 @@ static noinline void integrity_recheck(struct dm_integrity_io *dio, char *checks } mem = bvec_kmap_local(&bv); - memcpy(mem + pos, buffer, ic->sectors_per_block << SECTOR_SHIFT); + memcpy(mem + pos, buffer + buffer_offset, ic->sectors_per_block << SECTOR_SHIFT); kunmap_local(mem); pos += ic->sectors_per_block << SECTOR_SHIFT; @@ -1776,7 +1884,7 @@ static void integrity_metadata(struct work_struct *w) if (ic->internal_hash) { struct bvec_iter iter; struct bio_vec bv; - unsigned int digest_size = crypto_shash_digestsize(ic->internal_hash); + unsigned int digest_size = ic->internal_hash_digestsize; struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io)); char *checksums; unsigned int extra_space = unlikely(digest_size > ic->tag_size) ? 
digest_size - ic->tag_size : 0; @@ -1837,17 +1945,17 @@ static void integrity_metadata(struct work_struct *w) char *mem, *checksums_ptr; again: - mem = bvec_kmap_local(&bv_copy); + mem = integrity_kmap(ic, bv_copy.bv_page); pos = 0; checksums_ptr = checksums; do { - integrity_sector_checksum(ic, sector, mem + pos, checksums_ptr); + integrity_sector_checksum(ic, &dio->ahash_req, sector, mem, bv_copy.bv_offset + pos, checksums_ptr); checksums_ptr += ic->tag_size; sectors_to_process -= ic->sectors_per_block; pos += ic->sectors_per_block << SECTOR_SHIFT; sector += ic->sectors_per_block; } while (pos < bv_copy.bv_len && sectors_to_process && checksums != checksums_onstack); - kunmap_local(mem); + integrity_kunmap(ic, mem); r = dm_integrity_rw_tag(ic, checksums, &dio->metadata_block, &dio->metadata_offset, checksums_ptr - checksums, dio->op == REQ_OP_READ ? TAG_CMP : TAG_WRITE); @@ -1949,6 +2057,7 @@ static int dm_integrity_map(struct dm_target *ti, struct bio *bio) dio->ic = ic; dio->bi_status = 0; dio->op = bio_op(bio); + dio->ahash_req = NULL; if (ic->mode == 'I') { bio->bi_iter.bi_sector = dm_target_offset(ic->ti, bio->bi_iter.bi_sector); @@ -2071,19 +2180,6 @@ retry_kmap: js++; mem_ptr += 1 << SECTOR_SHIFT; } while (++s < ic->sectors_per_block); -#ifdef INTERNAL_VERIFY - if (ic->internal_hash) { - char checksums_onstack[MAX_T(size_t, HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)]; - - integrity_sector_checksum(ic, logical_sector, mem + bv.bv_offset, checksums_onstack); - if (unlikely(crypto_memneq(checksums_onstack, journal_entry_tag(ic, je), ic->tag_size))) { - DMERR_LIMIT("Checksum failed when reading from journal, at sector 0x%llx", - logical_sector); - dm_audit_log_bio(DM_MSG_PREFIX, "journal-checksum", - bio, logical_sector, 0); - } - } -#endif } if (!ic->internal_hash) { @@ -2124,15 +2220,17 @@ retry_kmap: } while (++s < ic->sectors_per_block); if (ic->internal_hash) { - unsigned int digest_size = crypto_shash_digestsize(ic->internal_hash); + unsigned int digest_size = ic->internal_hash_digestsize; + void *js_page = integrity_identity(ic, (char *)js - offset_in_page(js)); + unsigned js_offset = offset_in_page(js); if (unlikely(digest_size > ic->tag_size)) { char checksums_onstack[HASH_MAX_DIGESTSIZE]; - integrity_sector_checksum(ic, logical_sector, (char *)js, checksums_onstack); + integrity_sector_checksum(ic, &dio->ahash_req, logical_sector, js_page, js_offset, checksums_onstack); memcpy(journal_entry_tag(ic, je), checksums_onstack, ic->tag_size); } else - integrity_sector_checksum(ic, logical_sector, (char *)js, journal_entry_tag(ic, je)); + integrity_sector_checksum(ic, &dio->ahash_req, logical_sector, js_page, js_offset, journal_entry_tag(ic, je)); } journal_entry_set_sector(je, logical_sector); @@ -2428,7 +2526,7 @@ retry: if (!dio->integrity_payload) { unsigned digest_size, extra_size; dio->payload_len = ic->tuple_size * (bio_sectors(bio) >> ic->sb->log2_sectors_per_block); - digest_size = crypto_shash_digestsize(ic->internal_hash); + digest_size = ic->internal_hash_digestsize; extra_size = unlikely(digest_size > ic->tag_size) ? 
digest_size - ic->tag_size : 0; dio->payload_len += extra_size; dio->integrity_payload = kmalloc(dio->payload_len, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); @@ -2505,11 +2603,11 @@ skip_spinlock: unsigned pos = 0; while (dio->bio_details.bi_iter.bi_size) { struct bio_vec bv = bio_iter_iovec(bio, dio->bio_details.bi_iter); - const char *mem = bvec_kmap_local(&bv); + const char *mem = integrity_kmap(ic, bv.bv_page); if (ic->tag_size < ic->tuple_size) memset(dio->integrity_payload + pos + ic->tag_size, 0, ic->tuple_size - ic->tag_size); - integrity_sector_checksum(ic, dio->bio_details.bi_iter.bi_sector, mem, dio->integrity_payload + pos); - kunmap_local(mem); + integrity_sector_checksum(ic, &dio->ahash_req, dio->bio_details.bi_iter.bi_sector, mem, bv.bv_offset, dio->integrity_payload + pos); + integrity_kunmap(ic, mem); pos += ic->tuple_size; bio_advance_iter_single(bio, &dio->bio_details.bi_iter, ic->sectors_per_block << SECTOR_SHIFT); } @@ -2588,8 +2686,8 @@ static void dm_integrity_inline_recheck(struct work_struct *w) } bio_put(outgoing_bio); - integrity_sector_checksum(ic, dio->bio_details.bi_iter.bi_sector, outgoing_data, digest); - if (unlikely(crypto_memneq(digest, dio->integrity_payload, min(crypto_shash_digestsize(ic->internal_hash), ic->tag_size)))) { + integrity_sector_checksum(ic, &dio->ahash_req, dio->bio_details.bi_iter.bi_sector, integrity_identity(ic, outgoing_data), 0, digest); + if (unlikely(crypto_memneq(digest, dio->integrity_payload, min(ic->internal_hash_digestsize, ic->tag_size)))) { DMERR_LIMIT("%pg: Checksum failed at sector 0x%llx", ic->dev->bdev, dio->bio_details.bi_iter.bi_sector); atomic64_inc(&ic->number_of_mismatches); @@ -2612,33 +2710,58 @@ static void dm_integrity_inline_recheck(struct work_struct *w) bio_endio(bio); } +static inline bool dm_integrity_check(struct dm_integrity_c *ic, struct dm_integrity_io *dio) +{ + struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io)); + unsigned pos = 0; + + while (dio->bio_details.bi_iter.bi_size) { + char digest[HASH_MAX_DIGESTSIZE]; + struct bio_vec bv = bio_iter_iovec(bio, dio->bio_details.bi_iter); + char *mem = integrity_kmap(ic, bv.bv_page); + integrity_sector_checksum(ic, &dio->ahash_req, dio->bio_details.bi_iter.bi_sector, mem, bv.bv_offset, digest); + if (unlikely(crypto_memneq(digest, dio->integrity_payload + pos, + min(ic->internal_hash_digestsize, ic->tag_size)))) { + integrity_kunmap(ic, mem); + dm_integrity_free_payload(dio); + INIT_WORK(&dio->work, dm_integrity_inline_recheck); + queue_work(ic->offload_wq, &dio->work); + return false; + } + integrity_kunmap(ic, mem); + pos += ic->tuple_size; + bio_advance_iter_single(bio, &dio->bio_details.bi_iter, ic->sectors_per_block << SECTOR_SHIFT); + } + + return true; +} + +static void dm_integrity_inline_async_check(struct work_struct *w) +{ + struct dm_integrity_io *dio = container_of(w, struct dm_integrity_io, work); + struct dm_integrity_c *ic = dio->ic; + struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io)); + + if (likely(dm_integrity_check(ic, dio))) + bio_endio(bio); +} + static int dm_integrity_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *status) { struct dm_integrity_c *ic = ti->private; + struct dm_integrity_io *dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io)); if (ic->mode == 'I') { - struct dm_integrity_io *dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io)); - if (dio->op == REQ_OP_READ && likely(*status == BLK_STS_OK)) { - unsigned pos = 0; + if 
(dio->op == REQ_OP_READ && likely(*status == BLK_STS_OK) && likely(dio->bio_details.bi_iter.bi_size != 0)) { if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING) && unlikely(dio->integrity_range_locked)) - goto skip_check; - while (dio->bio_details.bi_iter.bi_size) { - char digest[HASH_MAX_DIGESTSIZE]; - struct bio_vec bv = bio_iter_iovec(bio, dio->bio_details.bi_iter); - char *mem = bvec_kmap_local(&bv); - //memset(mem, 0xff, ic->sectors_per_block << SECTOR_SHIFT); - integrity_sector_checksum(ic, dio->bio_details.bi_iter.bi_sector, mem, digest); - if (unlikely(crypto_memneq(digest, dio->integrity_payload + pos, - min(crypto_shash_digestsize(ic->internal_hash), ic->tag_size)))) { - kunmap_local(mem); - dm_integrity_free_payload(dio); - INIT_WORK(&dio->work, dm_integrity_inline_recheck); - queue_work(ic->offload_wq, &dio->work); + goto skip_check; + if (likely(ic->internal_shash != NULL)) { + if (unlikely(!dm_integrity_check(ic, dio))) return DM_ENDIO_INCOMPLETE; - } - kunmap_local(mem); - pos += ic->tuple_size; - bio_advance_iter_single(bio, &dio->bio_details.bi_iter, ic->sectors_per_block << SECTOR_SHIFT); + } else { + INIT_WORK(&dio->work, dm_integrity_inline_async_check); + queue_work(ic->offload_wq, &dio->work); + return DM_ENDIO_INCOMPLETE; } } skip_check: @@ -2646,6 +2769,8 @@ skip_check: if (unlikely(dio->integrity_range_locked)) remove_range(ic, &dio->range); } + if (unlikely(dio->ahash_req)) + mempool_free(dio->ahash_req, &ic->ahash_req_pool); return DM_ENDIO_DONE; } @@ -2902,9 +3027,12 @@ static void do_journal_write(struct dm_integrity_c *ic, unsigned int write_start #endif ic->internal_hash) { char test_tag[MAX_T(size_t, HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)]; + struct journal_sector *js = access_journal_data(ic, i, l); + void *js_page = integrity_identity(ic, (char *)js - offset_in_page(js)); + unsigned js_offset = offset_in_page(js); - integrity_sector_checksum(ic, sec + ((l - j) << ic->sb->log2_sectors_per_block), - (char *)access_journal_data(ic, i, l), test_tag); + integrity_sector_checksum(ic, &ic->journal_ahash_req, sec + ((l - j) << ic->sb->log2_sectors_per_block), + js_page, js_offset, test_tag); if (unlikely(crypto_memneq(test_tag, journal_entry_tag(ic, je2), ic->tag_size))) { dm_integrity_io_error(ic, "tag mismatch when replaying journal", -EILSEQ); dm_audit_log_target(DM_MSG_PREFIX, "integrity-replay-journal", ic->ti, 0); @@ -2987,6 +3115,7 @@ static void integrity_recalc(struct work_struct *w) size_t recalc_tags_size; u8 *recalc_buffer = NULL; u8 *recalc_tags = NULL; + struct ahash_request *ahash_req = NULL; struct dm_integrity_range range; struct dm_io_request io_req; struct dm_io_region io_loc; @@ -3001,7 +3130,7 @@ static void integrity_recalc(struct work_struct *w) unsigned recalc_sectors = RECALC_SECTORS; retry: - recalc_buffer = __vmalloc(recalc_sectors << SECTOR_SHIFT, GFP_NOIO); + recalc_buffer = kmalloc(recalc_sectors << SECTOR_SHIFT, GFP_NOIO | __GFP_NOWARN); if (!recalc_buffer) { oom: recalc_sectors >>= 1; @@ -3011,11 +3140,11 @@ oom: goto free_ret; } recalc_tags_size = (recalc_sectors >> ic->sb->log2_sectors_per_block) * ic->tag_size; - if (crypto_shash_digestsize(ic->internal_hash) > ic->tag_size) - recalc_tags_size += crypto_shash_digestsize(ic->internal_hash) - ic->tag_size; + if (ic->internal_hash_digestsize > ic->tag_size) + recalc_tags_size += ic->internal_hash_digestsize - ic->tag_size; recalc_tags = kvmalloc(recalc_tags_size, GFP_NOIO); if (!recalc_tags) { - vfree(recalc_buffer); + kfree(recalc_buffer); recalc_buffer = NULL; goto oom; } @@ 
-3081,7 +3210,7 @@ next_chunk: goto err; io_req.bi_opf = REQ_OP_READ; - io_req.mem.type = DM_IO_VMA; + io_req.mem.type = DM_IO_KMEM; io_req.mem.ptr.addr = recalc_buffer; io_req.notify.fn = NULL; io_req.client = ic->io; @@ -3097,7 +3226,10 @@ next_chunk: t = recalc_tags; for (i = 0; i < n_sectors; i += ic->sectors_per_block) { - integrity_sector_checksum(ic, logical_sector + i, recalc_buffer + (i << SECTOR_SHIFT), t); + void *ptr = recalc_buffer + (i << SECTOR_SHIFT); + void *ptr_page = integrity_identity(ic, (char *)ptr - offset_in_page(ptr)); + unsigned ptr_offset = offset_in_page(ptr); + integrity_sector_checksum(ic, &ahash_req, logical_sector + i, ptr_page, ptr_offset, t); t += ic->tag_size; } @@ -3139,8 +3271,9 @@ unlock_ret: recalc_write_super(ic); free_ret: - vfree(recalc_buffer); + kfree(recalc_buffer); kvfree(recalc_tags); + mempool_free(ahash_req, &ic->ahash_req_pool); } static void integrity_recalc_inline(struct work_struct *w) @@ -3149,6 +3282,7 @@ static void integrity_recalc_inline(struct work_struct *w) size_t recalc_tags_size; u8 *recalc_buffer = NULL; u8 *recalc_tags = NULL; + struct ahash_request *ahash_req = NULL; struct dm_integrity_range range; struct bio *bio; struct bio_integrity_payload *bip; @@ -3171,8 +3305,8 @@ oom: } recalc_tags_size = (recalc_sectors >> ic->sb->log2_sectors_per_block) * ic->tuple_size; - if (crypto_shash_digestsize(ic->internal_hash) > ic->tuple_size) - recalc_tags_size += crypto_shash_digestsize(ic->internal_hash) - ic->tuple_size; + if (ic->internal_hash_digestsize > ic->tuple_size) + recalc_tags_size += ic->internal_hash_digestsize - ic->tuple_size; recalc_tags = kmalloc(recalc_tags_size, GFP_NOIO | __GFP_NOWARN); if (!recalc_tags) { kfree(recalc_buffer); @@ -3217,8 +3351,11 @@ next_chunk: t = recalc_tags; for (i = 0; i < range.n_sectors; i += ic->sectors_per_block) { + void *ptr = recalc_buffer + (i << SECTOR_SHIFT); + void *ptr_page = integrity_identity(ic, (char *)ptr - offset_in_page(ptr)); + unsigned ptr_offset = offset_in_page(ptr); memset(t, 0, ic->tuple_size); - integrity_sector_checksum(ic, range.logical_sector + i, recalc_buffer + (i << SECTOR_SHIFT), t); + integrity_sector_checksum(ic, &ahash_req, range.logical_sector + i, ptr_page, ptr_offset, t); t += ic->tuple_size; } @@ -3270,6 +3407,7 @@ unlock_ret: free_ret: kfree(recalc_buffer); kfree(recalc_tags); + mempool_free(ahash_req, &ic->ahash_req_pool); } static void bitmap_block_work(struct work_struct *w) @@ -4210,30 +4348,53 @@ nomem: return -ENOMEM; } -static int get_mac(struct crypto_shash **hash, struct alg_spec *a, char **error, - char *error_alg, char *error_key) +static int get_mac(struct crypto_shash **shash, struct crypto_ahash **ahash, + struct alg_spec *a, char **error, char *error_alg, char *error_key) { int r; if (a->alg_string) { - *hash = crypto_alloc_shash(a->alg_string, 0, CRYPTO_ALG_ALLOCATES_MEMORY); - if (IS_ERR(*hash)) { - *error = error_alg; - r = PTR_ERR(*hash); - *hash = NULL; - return r; - } - - if (a->key) { - r = crypto_shash_setkey(*hash, a->key, a->key_size); - if (r) { + if (shash) { + *shash = crypto_alloc_shash(a->alg_string, 0, CRYPTO_ALG_ALLOCATES_MEMORY); + if (IS_ERR(*shash)) { + *shash = NULL; + goto try_ahash; + } + if (a->key) { + r = crypto_shash_setkey(*shash, a->key, a->key_size); + if (r) { + *error = error_key; + return r; + } + } else if (crypto_shash_get_flags(*shash) & CRYPTO_TFM_NEED_KEY) { *error = error_key; + return -ENOKEY; + } + return 0; + } +try_ahash: + if (ahash) { + *ahash = crypto_alloc_ahash(a->alg_string, 0, 
CRYPTO_ALG_ALLOCATES_MEMORY); + if (IS_ERR(*ahash)) { + *error = error_alg; + r = PTR_ERR(*ahash); + *ahash = NULL; return r; } - } else if (crypto_shash_get_flags(*hash) & CRYPTO_TFM_NEED_KEY) { - *error = error_key; - return -ENOKEY; + if (a->key) { + r = crypto_ahash_setkey(*ahash, a->key, a->key_size); + if (r) { + *error = error_key; + return r; + } + } else if (crypto_ahash_get_flags(*ahash) & CRYPTO_TFM_NEED_KEY) { + *error = error_key; + return -ENOKEY; + } + return 0; } + *error = error_alg; + return -ENOENT; } return 0; @@ -4690,12 +4851,26 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned int argc, char **argv buffer_sectors = 1; ic->log2_buffer_sectors = min((int)__fls(buffer_sectors), 31 - SECTOR_SHIFT); - r = get_mac(&ic->internal_hash, &ic->internal_hash_alg, &ti->error, + r = get_mac(&ic->internal_shash, &ic->internal_ahash, &ic->internal_hash_alg, &ti->error, "Invalid internal hash", "Error setting internal hash key"); if (r) goto bad; + if (ic->internal_shash) { + ic->internal_hash = true; + ic->internal_hash_digestsize = crypto_shash_digestsize(ic->internal_shash); + } + if (ic->internal_ahash) { + ic->internal_hash = true; + ic->internal_hash_digestsize = crypto_ahash_digestsize(ic->internal_ahash); + r = mempool_init_kmalloc_pool(&ic->ahash_req_pool, AHASH_MEMPOOL, + sizeof(struct ahash_request) + crypto_ahash_reqsize(ic->internal_ahash)); + if (r) { + ti->error = "Cannot allocate mempool"; + goto bad; + } + } - r = get_mac(&ic->journal_mac, &ic->journal_mac_alg, &ti->error, + r = get_mac(&ic->journal_mac, NULL, &ic->journal_mac_alg, &ti->error, "Invalid journal mac", "Error setting journal mac key"); if (r) goto bad; @@ -4706,7 +4881,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned int argc, char **argv r = -EINVAL; goto bad; } - ic->tag_size = crypto_shash_digestsize(ic->internal_hash); + ic->tag_size = ic->internal_hash_digestsize; } if (ic->tag_size > MAX_TAG_SIZE) { ti->error = "Too big tag size"; @@ -5178,6 +5353,8 @@ static void dm_integrity_dtr(struct dm_target *ti) kvfree(ic->bbs); if (ic->bufio) dm_bufio_client_destroy(ic->bufio); + mempool_free(ic->journal_ahash_req, &ic->ahash_req_pool); + mempool_exit(&ic->ahash_req_pool); bioset_exit(&ic->recalc_bios); bioset_exit(&ic->recheck_bios); mempool_exit(&ic->recheck_pool); @@ -5215,8 +5392,10 @@ static void dm_integrity_dtr(struct dm_target *ti) if (ic->sb) free_pages_exact(ic->sb, SB_SECTORS << SECTOR_SHIFT); - if (ic->internal_hash) - crypto_free_shash(ic->internal_hash); + if (ic->internal_shash) + crypto_free_shash(ic->internal_shash); + if (ic->internal_ahash) + crypto_free_ahash(ic->internal_ahash); free_alg(&ic->internal_hash_alg); if (ic->journal_crypt) @@ -5233,7 +5412,7 @@ static void dm_integrity_dtr(struct dm_target *ti) static struct target_type integrity_target = { .name = "integrity", - .version = {1, 13, 0}, + .version = {1, 14, 0}, .module = THIS_MODULE, .features = DM_TARGET_SINGLETON | DM_TARGET_INTEGRITY, .ctr = dm_integrity_ctr, diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c index 679b07dee229..7bb7174f8f4f 100644 --- a/drivers/md/dm-log-writes.c +++ b/drivers/md/dm-log-writes.c @@ -414,7 +414,7 @@ static int log_super(struct log_writes_c *lc) } /* - * Super sector should be writen in-order, otherwise the + * Super sector should be written in-order, otherwise the * nr_entries could be rewritten incorrectly by an old bio. 
*/ wait_for_completion_io(&lc->super_done); diff --git a/drivers/md/dm-pcache/Kconfig b/drivers/md/dm-pcache/Kconfig new file mode 100644 index 000000000000..0e251eca892e --- /dev/null +++ b/drivers/md/dm-pcache/Kconfig @@ -0,0 +1,17 @@ +config DM_PCACHE + tristate "Persistent cache for Block Device (Experimental)" + depends on BLK_DEV_DM + depends on DEV_DAX + help + PCACHE provides a mechanism to use persistent memory (e.g., CXL persistent memory, + DAX-enabled devices) as a high-performance cache layer in front of + traditional block devices such as SSDs or HDDs. + + PCACHE is implemented as a kernel module that integrates with the block + layer and supports direct access (DAX) to persistent memory for low-latency, + byte-addressable caching. + + Note: This feature is experimental and should be tested thoroughly + before use in production environments. + + If unsure, say 'N'. diff --git a/drivers/md/dm-pcache/Makefile b/drivers/md/dm-pcache/Makefile new file mode 100644 index 000000000000..86776e4acad2 --- /dev/null +++ b/drivers/md/dm-pcache/Makefile @@ -0,0 +1,3 @@ +dm-pcache-y := dm_pcache.o cache_dev.o segment.o backing_dev.o cache.o cache_gc.o cache_writeback.o cache_segment.o cache_key.o cache_req.o + +obj-m += dm-pcache.o diff --git a/drivers/md/dm-pcache/backing_dev.c b/drivers/md/dm-pcache/backing_dev.c new file mode 100644 index 000000000000..7165fc0364bb --- /dev/null +++ b/drivers/md/dm-pcache/backing_dev.c @@ -0,0 +1,374 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include <linux/blkdev.h> + +#include "../dm-core.h" +#include "pcache_internal.h" +#include "cache_dev.h" +#include "backing_dev.h" +#include "cache.h" +#include "dm_pcache.h" + +static struct kmem_cache *backing_req_cache; +static struct kmem_cache *backing_bvec_cache; + +static void backing_dev_exit(struct pcache_backing_dev *backing_dev) +{ + mempool_exit(&backing_dev->req_pool); + mempool_exit(&backing_dev->bvec_pool); +} + +static void req_submit_fn(struct work_struct *work); +static void req_complete_fn(struct work_struct *work); +static int backing_dev_init(struct dm_pcache *pcache) +{ + struct pcache_backing_dev *backing_dev = &pcache->backing_dev; + int ret; + + ret = mempool_init_slab_pool(&backing_dev->req_pool, 128, backing_req_cache); + if (ret) + goto err; + + ret = mempool_init_slab_pool(&backing_dev->bvec_pool, 128, backing_bvec_cache); + if (ret) + goto req_pool_exit; + + INIT_LIST_HEAD(&backing_dev->submit_list); + INIT_LIST_HEAD(&backing_dev->complete_list); + spin_lock_init(&backing_dev->submit_lock); + spin_lock_init(&backing_dev->complete_lock); + INIT_WORK(&backing_dev->req_submit_work, req_submit_fn); + INIT_WORK(&backing_dev->req_complete_work, req_complete_fn); + atomic_set(&backing_dev->inflight_reqs, 0); + init_waitqueue_head(&backing_dev->inflight_wq); + + return 0; + +req_pool_exit: + mempool_exit(&backing_dev->req_pool); +err: + return ret; +} + +int backing_dev_start(struct dm_pcache *pcache) +{ + struct pcache_backing_dev *backing_dev = &pcache->backing_dev; + int ret; + + ret = backing_dev_init(pcache); + if (ret) + return ret; + + backing_dev->dev_size = bdev_nr_sectors(backing_dev->dm_dev->bdev); + + return 0; +} + +void backing_dev_stop(struct dm_pcache *pcache) +{ + struct pcache_backing_dev *backing_dev = &pcache->backing_dev; + + /* + * There should not be any new request coming, just wait for + * inflight requests to finish. 
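+ * + * Once inflight_reqs has dropped to zero no request can queue further + * submit or complete work, so the flush_work() calls below only need to + * drain work items that were already queued.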
+ */ + wait_event(backing_dev->inflight_wq, + atomic_read(&backing_dev->inflight_reqs) == 0); + + flush_work(&backing_dev->req_submit_work); + flush_work(&backing_dev->req_complete_work); + + backing_dev_exit(backing_dev); +} + +/* pcache_backing_dev_req functions */ +void backing_dev_req_end(struct pcache_backing_dev_req *backing_req) +{ + struct pcache_backing_dev *backing_dev = backing_req->backing_dev; + + if (backing_req->end_req) + backing_req->end_req(backing_req, backing_req->ret); + + switch (backing_req->type) { + case BACKING_DEV_REQ_TYPE_REQ: + if (backing_req->req.upper_req) + pcache_req_put(backing_req->req.upper_req, backing_req->ret); + break; + case BACKING_DEV_REQ_TYPE_KMEM: + if (backing_req->kmem.bvecs != backing_req->kmem.inline_bvecs) + mempool_free(backing_req->kmem.bvecs, &backing_dev->bvec_pool); + break; + default: + BUG(); + } + + mempool_free(backing_req, &backing_dev->req_pool); + + if (atomic_dec_and_test(&backing_dev->inflight_reqs)) + wake_up(&backing_dev->inflight_wq); +} + +static void req_complete_fn(struct work_struct *work) +{ + struct pcache_backing_dev *backing_dev = container_of(work, struct pcache_backing_dev, req_complete_work); + struct pcache_backing_dev_req *backing_req; + LIST_HEAD(tmp_list); + + spin_lock_irq(&backing_dev->complete_lock); + list_splice_init(&backing_dev->complete_list, &tmp_list); + spin_unlock_irq(&backing_dev->complete_lock); + + while (!list_empty(&tmp_list)) { + backing_req = list_first_entry(&tmp_list, + struct pcache_backing_dev_req, node); + list_del_init(&backing_req->node); + backing_dev_req_end(backing_req); + } +} + +static void backing_dev_bio_end(struct bio *bio) +{ + struct pcache_backing_dev_req *backing_req = bio->bi_private; + struct pcache_backing_dev *backing_dev = backing_req->backing_dev; + unsigned long flags; + + backing_req->ret = blk_status_to_errno(bio->bi_status); + + spin_lock_irqsave(&backing_dev->complete_lock, flags); + list_move_tail(&backing_req->node, &backing_dev->complete_list); + queue_work(BACKING_DEV_TO_PCACHE(backing_dev)->task_wq, &backing_dev->req_complete_work); + spin_unlock_irqrestore(&backing_dev->complete_lock, flags); +} + +static void req_submit_fn(struct work_struct *work) +{ + struct pcache_backing_dev *backing_dev = container_of(work, struct pcache_backing_dev, req_submit_work); + struct pcache_backing_dev_req *backing_req; + LIST_HEAD(tmp_list); + + spin_lock(&backing_dev->submit_lock); + list_splice_init(&backing_dev->submit_list, &tmp_list); + spin_unlock(&backing_dev->submit_lock); + + while (!list_empty(&tmp_list)) { + backing_req = list_first_entry(&tmp_list, + struct pcache_backing_dev_req, node); + list_del_init(&backing_req->node); + submit_bio_noacct(&backing_req->bio); + } +} + +void backing_dev_req_submit(struct pcache_backing_dev_req *backing_req, bool direct) +{ + struct pcache_backing_dev *backing_dev = backing_req->backing_dev; + + if (direct) { + submit_bio_noacct(&backing_req->bio); + return; + } + + spin_lock(&backing_dev->submit_lock); + list_add_tail(&backing_req->node, &backing_dev->submit_list); + queue_work(BACKING_DEV_TO_PCACHE(backing_dev)->task_wq, &backing_dev->req_submit_work); + spin_unlock(&backing_dev->submit_lock); +} + +static void bio_map(struct bio *bio, void *base, size_t size) +{ + struct page *page; + unsigned int offset; + unsigned int len; + + if (!is_vmalloc_addr(base)) { + page = virt_to_page(base); + offset = offset_in_page(base); + + BUG_ON(!bio_add_page(bio, page, size, offset)); + return; + } + + flush_kernel_vmap_range(base, 
size); + while (size) { + page = vmalloc_to_page(base); + offset = offset_in_page(base); + len = min_t(size_t, PAGE_SIZE - offset, size); + + BUG_ON(!bio_add_page(bio, page, len, offset)); + size -= len; + base += len; + } +} + +static struct pcache_backing_dev_req *req_type_req_alloc(struct pcache_backing_dev *backing_dev, + struct pcache_backing_dev_req_opts *opts) +{ + struct pcache_request *pcache_req = opts->req.upper_req; + struct pcache_backing_dev_req *backing_req; + struct bio *orig = pcache_req->bio; + + backing_req = mempool_alloc(&backing_dev->req_pool, opts->gfp_mask); + if (!backing_req) + return NULL; + + memset(backing_req, 0, sizeof(struct pcache_backing_dev_req)); + + bio_init_clone(backing_dev->dm_dev->bdev, &backing_req->bio, orig, opts->gfp_mask); + + backing_req->type = BACKING_DEV_REQ_TYPE_REQ; + backing_req->backing_dev = backing_dev; + atomic_inc(&backing_dev->inflight_reqs); + + return backing_req; +} + +static struct pcache_backing_dev_req *kmem_type_req_alloc(struct pcache_backing_dev *backing_dev, + struct pcache_backing_dev_req_opts *opts) +{ + struct pcache_backing_dev_req *backing_req; + u32 n_vecs = bio_add_max_vecs(opts->kmem.data, opts->kmem.len); + + backing_req = mempool_alloc(&backing_dev->req_pool, opts->gfp_mask); + if (!backing_req) + return NULL; + + memset(backing_req, 0, sizeof(struct pcache_backing_dev_req)); + + if (n_vecs > BACKING_DEV_REQ_INLINE_BVECS) { + backing_req->kmem.bvecs = mempool_alloc(&backing_dev->bvec_pool, opts->gfp_mask); + if (!backing_req->kmem.bvecs) + goto free_backing_req; + } else { + backing_req->kmem.bvecs = backing_req->kmem.inline_bvecs; + } + + backing_req->kmem.n_vecs = n_vecs; + backing_req->type = BACKING_DEV_REQ_TYPE_KMEM; + backing_req->backing_dev = backing_dev; + atomic_inc(&backing_dev->inflight_reqs); + + return backing_req; + +free_backing_req: + mempool_free(backing_req, &backing_dev->req_pool); + return NULL; +} + +struct pcache_backing_dev_req *backing_dev_req_alloc(struct pcache_backing_dev *backing_dev, + struct pcache_backing_dev_req_opts *opts) +{ + if (opts->type == BACKING_DEV_REQ_TYPE_REQ) + return req_type_req_alloc(backing_dev, opts); + + if (opts->type == BACKING_DEV_REQ_TYPE_KMEM) + return kmem_type_req_alloc(backing_dev, opts); + + BUG(); +} + +static void req_type_req_init(struct pcache_backing_dev_req *backing_req, + struct pcache_backing_dev_req_opts *opts) +{ + struct pcache_request *pcache_req = opts->req.upper_req; + struct bio *clone; + u32 off = opts->req.req_off; + u32 len = opts->req.len; + + clone = &backing_req->bio; + BUG_ON(off & SECTOR_MASK); + BUG_ON(len & SECTOR_MASK); + bio_trim(clone, off >> SECTOR_SHIFT, len >> SECTOR_SHIFT); + + clone->bi_iter.bi_sector = (pcache_req->off + off) >> SECTOR_SHIFT; + clone->bi_private = backing_req; + clone->bi_end_io = backing_dev_bio_end; + + INIT_LIST_HEAD(&backing_req->node); + backing_req->end_req = opts->end_fn; + + pcache_req_get(pcache_req); + backing_req->req.upper_req = pcache_req; + backing_req->req.bio_off = off; +} + +static void kmem_type_req_init(struct pcache_backing_dev_req *backing_req, + struct pcache_backing_dev_req_opts *opts) +{ + struct pcache_backing_dev *backing_dev = backing_req->backing_dev; + struct bio *backing_bio; + + bio_init(&backing_req->bio, backing_dev->dm_dev->bdev, backing_req->kmem.bvecs, + backing_req->kmem.n_vecs, opts->kmem.opf); + + backing_bio = &backing_req->bio; + bio_map(backing_bio, opts->kmem.data, opts->kmem.len); + + backing_bio->bi_iter.bi_sector = (opts->kmem.backing_off) >> SECTOR_SHIFT; 
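+ /* kmem.backing_off above is a byte offset; bi_iter.bi_sector counts 512-byte units (SECTOR_SHIFT) regardless of the device block size. */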
+ backing_bio->bi_private = backing_req; + backing_bio->bi_end_io = backing_dev_bio_end; + + INIT_LIST_HEAD(&backing_req->node); + backing_req->end_req = opts->end_fn; + backing_req->priv_data = opts->priv_data; +} + +void backing_dev_req_init(struct pcache_backing_dev_req *backing_req, + struct pcache_backing_dev_req_opts *opts) +{ + if (opts->type == BACKING_DEV_REQ_TYPE_REQ) + return req_type_req_init(backing_req, opts); + + if (opts->type == BACKING_DEV_REQ_TYPE_KMEM) + return kmem_type_req_init(backing_req, opts); + + BUG(); +} + +struct pcache_backing_dev_req *backing_dev_req_create(struct pcache_backing_dev *backing_dev, + struct pcache_backing_dev_req_opts *opts) +{ + struct pcache_backing_dev_req *backing_req; + + backing_req = backing_dev_req_alloc(backing_dev, opts); + if (!backing_req) + return NULL; + + backing_dev_req_init(backing_req, opts); + + return backing_req; +} + +void backing_dev_flush(struct pcache_backing_dev *backing_dev) +{ + blkdev_issue_flush(backing_dev->dm_dev->bdev); +} + +int pcache_backing_init(void) +{ + u32 max_bvecs = (PCACHE_CACHE_SUBTREE_SIZE >> PAGE_SHIFT) + 1; + int ret; + + backing_req_cache = KMEM_CACHE(pcache_backing_dev_req, 0); + if (!backing_req_cache) { + ret = -ENOMEM; + goto err; + } + + backing_bvec_cache = kmem_cache_create("pcache-bvec-slab", + max_bvecs * sizeof(struct bio_vec), + 0, 0, NULL); + if (!backing_bvec_cache) { + ret = -ENOMEM; + goto destroy_req_cache; + } + + return 0; +destroy_req_cache: + kmem_cache_destroy(backing_req_cache); +err: + return ret; +} + +void pcache_backing_exit(void) +{ + kmem_cache_destroy(backing_bvec_cache); + kmem_cache_destroy(backing_req_cache); +} diff --git a/drivers/md/dm-pcache/backing_dev.h b/drivers/md/dm-pcache/backing_dev.h new file mode 100644 index 000000000000..b371cba483b9 --- /dev/null +++ b/drivers/md/dm-pcache/backing_dev.h @@ -0,0 +1,127 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _BACKING_DEV_H +#define _BACKING_DEV_H + +#include <linux/device-mapper.h> + +#include "pcache_internal.h" + +struct pcache_backing_dev_req; +typedef void (*backing_req_end_fn_t)(struct pcache_backing_dev_req *backing_req, int ret); + +#define BACKING_DEV_REQ_TYPE_REQ 1 +#define BACKING_DEV_REQ_TYPE_KMEM 2 + +#define BACKING_DEV_REQ_INLINE_BVECS 4 + +struct pcache_request; +struct pcache_backing_dev_req { + u8 type; + struct bio bio; + struct pcache_backing_dev *backing_dev; + + void *priv_data; + backing_req_end_fn_t end_req; + + struct list_head node; + int ret; + + union { + struct { + struct pcache_request *upper_req; + u32 bio_off; + } req; + struct { + struct bio_vec inline_bvecs[BACKING_DEV_REQ_INLINE_BVECS]; + struct bio_vec *bvecs; + u32 n_vecs; + } kmem; + }; +}; + +struct pcache_backing_dev { + struct pcache_cache *cache; + + struct dm_dev *dm_dev; + mempool_t req_pool; + mempool_t bvec_pool; + + struct list_head submit_list; + spinlock_t submit_lock; + struct work_struct req_submit_work; + + struct list_head complete_list; + spinlock_t complete_lock; + struct work_struct req_complete_work; + + atomic_t inflight_reqs; + wait_queue_head_t inflight_wq; + + u64 dev_size; +}; + +struct dm_pcache; +int backing_dev_start(struct dm_pcache *pcache); +void backing_dev_stop(struct dm_pcache *pcache); + +struct pcache_backing_dev_req_opts { + u32 type; + union { + struct { + struct pcache_request *upper_req; + u32 req_off; + u32 len; + } req; + struct { + void *data; + blk_opf_t opf; + u32 len; + u64 backing_off; + } kmem; + }; + + gfp_t gfp_mask; + backing_req_end_fn_t end_fn; + void 
*priv_data; +}; + +static inline u32 backing_dev_req_coalesced_max_len(const void *data, u32 len) +{ + const void *p = data; + u32 done = 0, in_page, to_advance; + struct page *first_page, *next_page; + + if (!is_vmalloc_addr(data)) + return len; + + first_page = vmalloc_to_page(p); +advance: + in_page = PAGE_SIZE - offset_in_page(p); + to_advance = min_t(u32, in_page, len - done); + + done += to_advance; + p += to_advance; + + if (done == len) + return done; + + next_page = vmalloc_to_page(p); + if (zone_device_pages_have_same_pgmap(first_page, next_page)) + goto advance; + + return done; +} + +void backing_dev_req_submit(struct pcache_backing_dev_req *backing_req, bool direct); +void backing_dev_req_end(struct pcache_backing_dev_req *backing_req); +struct pcache_backing_dev_req *backing_dev_req_create(struct pcache_backing_dev *backing_dev, + struct pcache_backing_dev_req_opts *opts); +struct pcache_backing_dev_req *backing_dev_req_alloc(struct pcache_backing_dev *backing_dev, + struct pcache_backing_dev_req_opts *opts); +void backing_dev_req_init(struct pcache_backing_dev_req *backing_req, + struct pcache_backing_dev_req_opts *opts); +void backing_dev_flush(struct pcache_backing_dev *backing_dev); + +int pcache_backing_init(void); +void pcache_backing_exit(void); +#endif /* _BACKING_DEV_H */ diff --git a/drivers/md/dm-pcache/cache.c b/drivers/md/dm-pcache/cache.c new file mode 100644 index 000000000000..d8e92367d947 --- /dev/null +++ b/drivers/md/dm-pcache/cache.c @@ -0,0 +1,445 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include <linux/blk_types.h> + +#include "cache.h" +#include "cache_dev.h" +#include "backing_dev.h" +#include "dm_pcache.h" + +struct kmem_cache *key_cache; + +static inline struct pcache_cache_info *get_cache_info_addr(struct pcache_cache *cache) +{ + return cache->cache_info_addr + cache->info_index; +} + +static void cache_info_write(struct pcache_cache *cache) +{ + struct pcache_cache_info *cache_info = &cache->cache_info; + + cache_info->header.seq++; + cache_info->header.crc = pcache_meta_crc(&cache_info->header, + sizeof(struct pcache_cache_info)); + + memcpy_flushcache(get_cache_info_addr(cache), cache_info, + sizeof(struct pcache_cache_info)); + + cache->info_index = (cache->info_index + 1) % PCACHE_META_INDEX_MAX; +} + +static void cache_info_init_default(struct pcache_cache *cache); +static int cache_info_init(struct pcache_cache *cache, struct pcache_cache_options *opts) +{ + struct dm_pcache *pcache = CACHE_TO_PCACHE(cache); + struct pcache_cache_info *cache_info_addr; + + cache_info_addr = pcache_meta_find_latest(&cache->cache_info_addr->header, + sizeof(struct pcache_cache_info), + PCACHE_CACHE_INFO_SIZE, + &cache->cache_info); + if (IS_ERR(cache_info_addr)) + return PTR_ERR(cache_info_addr); + + if (cache_info_addr) { + if (opts->data_crc != + (cache->cache_info.flags & PCACHE_CACHE_FLAGS_DATA_CRC)) { + pcache_dev_err(pcache, "invalid option for data_crc: %s, expected: %s", + opts->data_crc ? "true" : "false", + cache->cache_info.flags & PCACHE_CACHE_FLAGS_DATA_CRC ? 
"true" : "false"); + return -EINVAL; + } + + return 0; + } + + /* init cache_info for new cache */ + cache_info_init_default(cache); + cache_mode_set(cache, opts->cache_mode); + if (opts->data_crc) + cache->cache_info.flags |= PCACHE_CACHE_FLAGS_DATA_CRC; + + return 0; +} + +static void cache_info_set_gc_percent(struct pcache_cache_info *cache_info, u8 percent) +{ + cache_info->flags &= ~PCACHE_CACHE_FLAGS_GC_PERCENT_MASK; + cache_info->flags |= FIELD_PREP(PCACHE_CACHE_FLAGS_GC_PERCENT_MASK, percent); +} + +int pcache_cache_set_gc_percent(struct pcache_cache *cache, u8 percent) +{ + if (percent > PCACHE_CACHE_GC_PERCENT_MAX || percent < PCACHE_CACHE_GC_PERCENT_MIN) + return -EINVAL; + + mutex_lock(&cache->cache_info_lock); + cache_info_set_gc_percent(&cache->cache_info, percent); + + cache_info_write(cache); + mutex_unlock(&cache->cache_info_lock); + + return 0; +} + +void cache_pos_encode(struct pcache_cache *cache, + struct pcache_cache_pos_onmedia *pos_onmedia_base, + struct pcache_cache_pos *pos, u64 seq, u32 *index) +{ + struct pcache_cache_pos_onmedia pos_onmedia; + struct pcache_cache_pos_onmedia *pos_onmedia_addr = pos_onmedia_base + *index; + + pos_onmedia.cache_seg_id = pos->cache_seg->cache_seg_id; + pos_onmedia.seg_off = pos->seg_off; + pos_onmedia.header.seq = seq; + pos_onmedia.header.crc = cache_pos_onmedia_crc(&pos_onmedia); + + memcpy_flushcache(pos_onmedia_addr, &pos_onmedia, sizeof(struct pcache_cache_pos_onmedia)); + pmem_wmb(); + + *index = (*index + 1) % PCACHE_META_INDEX_MAX; +} + +int cache_pos_decode(struct pcache_cache *cache, + struct pcache_cache_pos_onmedia *pos_onmedia, + struct pcache_cache_pos *pos, u64 *seq, u32 *index) +{ + struct pcache_cache_pos_onmedia latest, *latest_addr; + + latest_addr = pcache_meta_find_latest(&pos_onmedia->header, + sizeof(struct pcache_cache_pos_onmedia), + sizeof(struct pcache_cache_pos_onmedia), + &latest); + if (IS_ERR(latest_addr)) + return PTR_ERR(latest_addr); + + if (!latest_addr) + return -EIO; + + pos->cache_seg = &cache->segments[latest.cache_seg_id]; + pos->seg_off = latest.seg_off; + *seq = latest.header.seq; + *index = (latest_addr - pos_onmedia); + + return 0; +} + +static inline void cache_info_set_seg_id(struct pcache_cache *cache, u32 seg_id) +{ + cache->cache_info.seg_id = seg_id; +} + +static int cache_init(struct dm_pcache *pcache) +{ + struct pcache_cache *cache = &pcache->cache; + struct pcache_backing_dev *backing_dev = &pcache->backing_dev; + struct pcache_cache_dev *cache_dev = &pcache->cache_dev; + int ret; + + cache->segments = kvcalloc(cache_dev->seg_num, sizeof(struct pcache_cache_segment), GFP_KERNEL); + if (!cache->segments) { + ret = -ENOMEM; + goto err; + } + + cache->seg_map = kvcalloc(BITS_TO_LONGS(cache_dev->seg_num), sizeof(unsigned long), GFP_KERNEL); + if (!cache->seg_map) { + ret = -ENOMEM; + goto free_segments; + } + + cache->backing_dev = backing_dev; + cache->cache_dev = &pcache->cache_dev; + cache->n_segs = cache_dev->seg_num; + atomic_set(&cache->gc_errors, 0); + spin_lock_init(&cache->seg_map_lock); + spin_lock_init(&cache->key_head_lock); + + mutex_init(&cache->cache_info_lock); + mutex_init(&cache->key_tail_lock); + mutex_init(&cache->dirty_tail_lock); + mutex_init(&cache->writeback_lock); + + INIT_DELAYED_WORK(&cache->writeback_work, cache_writeback_fn); + INIT_DELAYED_WORK(&cache->gc_work, pcache_cache_gc_fn); + INIT_WORK(&cache->clean_work, clean_fn); + + return 0; + +free_segments: + kvfree(cache->segments); +err: + return ret; +} + +static void cache_exit(struct pcache_cache 
*cache) +{ + kvfree(cache->seg_map); + kvfree(cache->segments); +} + +static void cache_info_init_default(struct pcache_cache *cache) +{ + struct pcache_cache_info *cache_info = &cache->cache_info; + + cache_info->header.seq = 0; + cache_info->n_segs = cache->cache_dev->seg_num; + cache_info_set_gc_percent(cache_info, PCACHE_CACHE_GC_PERCENT_DEFAULT); +} + +static int cache_tail_init(struct pcache_cache *cache) +{ + struct dm_pcache *pcache = CACHE_TO_PCACHE(cache); + bool new_cache = !(cache->cache_info.flags & PCACHE_CACHE_FLAGS_INIT_DONE); + + if (new_cache) { + __set_bit(0, cache->seg_map); + + cache->key_head.cache_seg = &cache->segments[0]; + cache->key_head.seg_off = 0; + cache_pos_copy(&cache->key_tail, &cache->key_head); + cache_pos_copy(&cache->dirty_tail, &cache->key_head); + + cache_encode_dirty_tail(cache); + cache_encode_key_tail(cache); + } else { + if (cache_decode_key_tail(cache) || cache_decode_dirty_tail(cache)) { + pcache_dev_err(pcache, "Corrupted key tail or dirty tail.\n"); + return -EIO; + } + } + + return 0; +} + +static int get_seg_id(struct pcache_cache *cache, + struct pcache_cache_segment *prev_cache_seg, + bool new_cache, u32 *seg_id) +{ + struct dm_pcache *pcache = CACHE_TO_PCACHE(cache); + struct pcache_cache_dev *cache_dev = cache->cache_dev; + int ret; + + if (new_cache) { + ret = cache_dev_get_empty_segment_id(cache_dev, seg_id); + if (ret) { + pcache_dev_err(pcache, "no available segment\n"); + goto err; + } + + if (prev_cache_seg) + cache_seg_set_next_seg(prev_cache_seg, *seg_id); + else + cache_info_set_seg_id(cache, *seg_id); + } else { + if (prev_cache_seg) { + struct pcache_segment_info *prev_seg_info; + + prev_seg_info = &prev_cache_seg->cache_seg_info; + if (!segment_info_has_next(prev_seg_info)) { + ret = -EFAULT; + goto err; + } + *seg_id = prev_cache_seg->cache_seg_info.next_seg; + } else { + *seg_id = cache->cache_info.seg_id; + } + } + return 0; +err: + return ret; +} + +static int cache_segs_init(struct pcache_cache *cache) +{ + struct pcache_cache_segment *prev_cache_seg = NULL; + struct pcache_cache_info *cache_info = &cache->cache_info; + bool new_cache = !(cache->cache_info.flags & PCACHE_CACHE_FLAGS_INIT_DONE); + u32 seg_id; + int ret; + u32 i; + + for (i = 0; i < cache_info->n_segs; i++) { + ret = get_seg_id(cache, prev_cache_seg, new_cache, &seg_id); + if (ret) + goto err; + + ret = cache_seg_init(cache, seg_id, i, new_cache); + if (ret) + goto err; + + prev_cache_seg = &cache->segments[i]; + } + return 0; +err: + return ret; +} + +static int cache_init_req_keys(struct pcache_cache *cache, u32 n_paral) +{ + struct dm_pcache *pcache = CACHE_TO_PCACHE(cache); + u32 n_subtrees; + int ret; + u32 i, cpu; + + /* Calculate number of cache trees based on the device size */ + n_subtrees = DIV_ROUND_UP(cache->dev_size << SECTOR_SHIFT, PCACHE_CACHE_SUBTREE_SIZE); + ret = cache_tree_init(cache, &cache->req_key_tree, n_subtrees); + if (ret) + goto err; + + cache->n_ksets = n_paral; + cache->ksets = kvcalloc(cache->n_ksets, PCACHE_KSET_SIZE, GFP_KERNEL); + if (!cache->ksets) { + ret = -ENOMEM; + goto req_tree_exit; + } + + /* + * Initialize each kset with a spinlock and delayed work for flushing. + * Each kset is associated with one queue to ensure independent handling + * of cache keys across multiple queues, maximizing multiqueue concurrency. 
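+ * For example, with n_paral == 4 there are four independent ksets; a key covering logical offset off is routed to kset (off >> PCACHE_CACHE_SUBTREE_SIZE_SHIFT) % n_ksets, as computed by get_kset_id().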
+ */ + for (i = 0; i < cache->n_ksets; i++) { + struct pcache_cache_kset *kset = get_kset(cache, i); + + kset->cache = cache; + spin_lock_init(&kset->kset_lock); + INIT_DELAYED_WORK(&kset->flush_work, kset_flush_fn); + } + + cache->data_heads = alloc_percpu(struct pcache_cache_data_head); + if (!cache->data_heads) { + ret = -ENOMEM; + goto free_kset; + } + + for_each_possible_cpu(cpu) { + struct pcache_cache_data_head *h = + per_cpu_ptr(cache->data_heads, cpu); + h->head_pos.cache_seg = NULL; + } + + /* + * Replay persisted cache keys using cache_replay. + * This function loads and replays cache keys from previously stored + * ksets, allowing the cache to restore its state after a restart. + */ + ret = cache_replay(cache); + if (ret) { + pcache_dev_err(pcache, "failed to replay keys\n"); + goto free_heads; + } + + return 0; + +free_heads: + free_percpu(cache->data_heads); +free_kset: + kvfree(cache->ksets); +req_tree_exit: + cache_tree_exit(&cache->req_key_tree); +err: + return ret; +} + +static void cache_destroy_req_keys(struct pcache_cache *cache) +{ + u32 i; + + for (i = 0; i < cache->n_ksets; i++) { + struct pcache_cache_kset *kset = get_kset(cache, i); + + cancel_delayed_work_sync(&kset->flush_work); + } + + free_percpu(cache->data_heads); + kvfree(cache->ksets); + cache_tree_exit(&cache->req_key_tree); +} + +int pcache_cache_start(struct dm_pcache *pcache) +{ + struct pcache_backing_dev *backing_dev = &pcache->backing_dev; + struct pcache_cache *cache = &pcache->cache; + struct pcache_cache_options *opts = &pcache->opts; + int ret; + + ret = cache_init(pcache); + if (ret) + return ret; + + cache->cache_info_addr = CACHE_DEV_CACHE_INFO(cache->cache_dev); + cache->cache_ctrl = CACHE_DEV_CACHE_CTRL(cache->cache_dev); + backing_dev->cache = cache; + cache->dev_size = backing_dev->dev_size; + + ret = cache_info_init(cache, opts); + if (ret) + goto cache_exit; + + ret = cache_segs_init(cache); + if (ret) + goto cache_exit; + + ret = cache_tail_init(cache); + if (ret) + goto cache_exit; + + ret = cache_init_req_keys(cache, num_online_cpus()); + if (ret) + goto cache_exit; + + ret = cache_writeback_init(cache); + if (ret) + goto destroy_keys; + + cache->cache_info.flags |= PCACHE_CACHE_FLAGS_INIT_DONE; + cache_info_write(cache); + queue_delayed_work(cache_get_wq(cache), &cache->gc_work, 0); + + return 0; + +destroy_keys: + cache_destroy_req_keys(cache); +cache_exit: + cache_exit(cache); + + return ret; +} + +void pcache_cache_stop(struct dm_pcache *pcache) +{ + struct pcache_cache *cache = &pcache->cache; + + cache_flush(cache); + + cancel_delayed_work_sync(&cache->gc_work); + flush_work(&cache->clean_work); + cache_writeback_exit(cache); + + if (cache->req_key_tree.n_subtrees) + cache_destroy_req_keys(cache); + + cache_exit(cache); +} + +struct workqueue_struct *cache_get_wq(struct pcache_cache *cache) +{ + struct dm_pcache *pcache = CACHE_TO_PCACHE(cache); + + return pcache->task_wq; +} + +int pcache_cache_init(void) +{ + key_cache = KMEM_CACHE(pcache_cache_key, 0); + if (!key_cache) + return -ENOMEM; + + return 0; +} + +void pcache_cache_exit(void) +{ + kmem_cache_destroy(key_cache); +} diff --git a/drivers/md/dm-pcache/cache.h b/drivers/md/dm-pcache/cache.h new file mode 100644 index 000000000000..1136d86958c8 --- /dev/null +++ b/drivers/md/dm-pcache/cache.h @@ -0,0 +1,635 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _PCACHE_CACHE_H +#define _PCACHE_CACHE_H + +#include "segment.h" + +/* Garbage collection thresholds */ +#define PCACHE_CACHE_GC_PERCENT_MIN 0 /* Minimum GC 
percentage */ +#define PCACHE_CACHE_GC_PERCENT_MAX 90 /* Maximum GC percentage */ +#define PCACHE_CACHE_GC_PERCENT_DEFAULT 70 /* Default GC percentage */ + +#define PCACHE_CACHE_SUBTREE_SIZE (4 * PCACHE_MB) /* 4MB total tree size */ +#define PCACHE_CACHE_SUBTREE_SIZE_MASK 0x3FFFFF /* Mask for tree size */ +#define PCACHE_CACHE_SUBTREE_SIZE_SHIFT 22 /* Bit shift for tree size */ + +/* Maximum number of keys per key set */ +#define PCACHE_KSET_KEYS_MAX 128 +#define PCACHE_CACHE_SEGS_MAX (1024 * 1024) /* maximum cache size for each device is 16T */ +#define PCACHE_KSET_ONMEDIA_SIZE_MAX struct_size_t(struct pcache_cache_kset_onmedia, data, PCACHE_KSET_KEYS_MAX) +#define PCACHE_KSET_SIZE (sizeof(struct pcache_cache_kset) + sizeof(struct pcache_cache_key_onmedia) * PCACHE_KSET_KEYS_MAX) + +/* Maximum number of keys to clean in one round of clean_work */ +#define PCACHE_CLEAN_KEYS_MAX 10 + +/* Writeback and garbage collection intervals in jiffies */ +#define PCACHE_CACHE_WRITEBACK_INTERVAL (5 * HZ) +#define PCACHE_CACHE_GC_INTERVAL (5 * HZ) + +/* Macro to get the cache key structure from an rb_node pointer */ +#define CACHE_KEY(node) (container_of(node, struct pcache_cache_key, rb_node)) + +struct pcache_cache_pos_onmedia { + struct pcache_meta_header header; + __u32 cache_seg_id; + __u32 seg_off; +}; + +/* Offset and size definitions for cache segment control */ +#define PCACHE_CACHE_SEG_CTRL_OFF (PCACHE_SEG_INFO_SIZE * PCACHE_META_INDEX_MAX) +#define PCACHE_CACHE_SEG_CTRL_SIZE (4 * PCACHE_KB) + +struct pcache_cache_seg_gen { + struct pcache_meta_header header; + __u64 gen; +}; + +/* Control structure for cache segments */ +struct pcache_cache_seg_ctrl { + struct pcache_cache_seg_gen gen[PCACHE_META_INDEX_MAX]; + __u64 res[64]; +}; + +#define PCACHE_CACHE_FLAGS_DATA_CRC BIT(0) +#define PCACHE_CACHE_FLAGS_INIT_DONE BIT(1) + +#define PCACHE_CACHE_FLAGS_CACHE_MODE_MASK GENMASK(5, 2) +#define PCACHE_CACHE_MODE_WRITEBACK 0 +#define PCACHE_CACHE_MODE_WRITETHROUGH 1 +#define PCACHE_CACHE_MODE_WRITEAROUND 2 +#define PCACHE_CACHE_MODE_WRITEONLY 3 + +#define PCACHE_CACHE_FLAGS_GC_PERCENT_MASK GENMASK(12, 6) + +struct pcache_cache_info { + struct pcache_meta_header header; + __u32 seg_id; + __u32 n_segs; + __u32 flags; + __u32 reserved; +}; + +struct pcache_cache_pos { + struct pcache_cache_segment *cache_seg; + u32 seg_off; +}; + +struct pcache_cache_segment { + struct pcache_cache *cache; + u32 cache_seg_id; /* Index in cache->segments */ + struct pcache_segment segment; + atomic_t refs; + + struct pcache_segment_info cache_seg_info; + struct mutex info_lock; + u32 info_index; + + spinlock_t gen_lock; + u64 gen; + u64 gen_seq; + u32 gen_index; + + struct pcache_cache_seg_ctrl *cache_seg_ctrl; +}; + +/* rbtree for cache entries */ +struct pcache_cache_subtree { + struct rb_root root; + spinlock_t tree_lock; +}; + +struct pcache_cache_tree { + struct pcache_cache *cache; + u32 n_subtrees; + mempool_t key_pool; + struct pcache_cache_subtree *subtrees; +}; + +extern struct kmem_cache *key_cache; + +struct pcache_cache_key { + struct pcache_cache_tree *cache_tree; + struct pcache_cache_subtree *cache_subtree; + struct kref ref; + struct rb_node rb_node; + struct list_head list_node; + u64 off; + u32 len; + u32 flags; + struct pcache_cache_pos cache_pos; + u64 seg_gen; +}; + +#define PCACHE_CACHE_KEY_FLAGS_EMPTY BIT(0) +#define PCACHE_CACHE_KEY_FLAGS_CLEAN BIT(1) + +struct pcache_cache_key_onmedia { + __u64 off; + __u32 len; + __u32 flags; + __u32 cache_seg_id; + __u32 cache_seg_off; + __u64 seg_gen; + __u32 
data_crc; + __u32 reserved; +}; + +struct pcache_cache_kset_onmedia { + __u32 crc; + union { + __u32 key_num; + __u32 next_cache_seg_id; + }; + __u64 magic; + __u64 flags; + struct pcache_cache_key_onmedia data[]; +}; + +struct pcache_cache { + struct pcache_backing_dev *backing_dev; + struct pcache_cache_dev *cache_dev; + struct pcache_cache_ctrl *cache_ctrl; + u64 dev_size; + + struct pcache_cache_data_head __percpu *data_heads; + + spinlock_t key_head_lock; + struct pcache_cache_pos key_head; + u32 n_ksets; + struct pcache_cache_kset *ksets; + + struct mutex key_tail_lock; + struct pcache_cache_pos key_tail; + u64 key_tail_seq; + u32 key_tail_index; + + struct mutex dirty_tail_lock; + struct pcache_cache_pos dirty_tail; + u64 dirty_tail_seq; + u32 dirty_tail_index; + + struct pcache_cache_tree req_key_tree; + struct work_struct clean_work; + + struct mutex writeback_lock; + char wb_kset_onmedia_buf[PCACHE_KSET_ONMEDIA_SIZE_MAX]; + struct pcache_cache_tree writeback_key_tree; + struct delayed_work writeback_work; + struct { + atomic_t pending; + u32 advance; + int ret; + } writeback_ctx; + + char gc_kset_onmedia_buf[PCACHE_KSET_ONMEDIA_SIZE_MAX]; + struct delayed_work gc_work; + atomic_t gc_errors; + + struct mutex cache_info_lock; + struct pcache_cache_info cache_info; + struct pcache_cache_info *cache_info_addr; + u32 info_index; + + u32 n_segs; + unsigned long *seg_map; + u32 last_cache_seg; + bool cache_full; + spinlock_t seg_map_lock; + struct pcache_cache_segment *segments; +}; + +struct workqueue_struct *cache_get_wq(struct pcache_cache *cache); + +struct dm_pcache; +struct pcache_cache_options { + u32 cache_mode:4; + u32 data_crc:1; +}; +int pcache_cache_start(struct dm_pcache *pcache); +void pcache_cache_stop(struct dm_pcache *pcache); + +struct pcache_cache_ctrl { + /* Updated by gc_thread */ + struct pcache_cache_pos_onmedia key_tail_pos[PCACHE_META_INDEX_MAX]; + + /* Updated by writeback_thread */ + struct pcache_cache_pos_onmedia dirty_tail_pos[PCACHE_META_INDEX_MAX]; +}; + +struct pcache_cache_data_head { + struct pcache_cache_pos head_pos; +}; + +static inline u16 pcache_cache_get_gc_percent(struct pcache_cache *cache) +{ + return FIELD_GET(PCACHE_CACHE_FLAGS_GC_PERCENT_MASK, cache->cache_info.flags); +} + +int pcache_cache_set_gc_percent(struct pcache_cache *cache, u8 percent); + +/* cache key */ +struct pcache_cache_key *cache_key_alloc(struct pcache_cache_tree *cache_tree, gfp_t gfp_mask); +void cache_key_init(struct pcache_cache_tree *cache_tree, struct pcache_cache_key *key); +void cache_key_get(struct pcache_cache_key *key); +void cache_key_put(struct pcache_cache_key *key); +int cache_key_append(struct pcache_cache *cache, struct pcache_cache_key *key, bool force_close); +void cache_key_insert(struct pcache_cache_tree *cache_tree, struct pcache_cache_key *key, bool fixup); +int cache_key_decode(struct pcache_cache *cache, + struct pcache_cache_key_onmedia *key_onmedia, + struct pcache_cache_key *key); +void cache_pos_advance(struct pcache_cache_pos *pos, u32 len); + +#define PCACHE_KSET_FLAGS_LAST BIT(0) +#define PCACHE_KSET_MAGIC 0x676894a64e164f1aULL + +struct pcache_cache_kset { + struct pcache_cache *cache; + spinlock_t kset_lock; + struct delayed_work flush_work; + struct pcache_cache_kset_onmedia kset_onmedia; +}; + +extern struct pcache_cache_kset_onmedia pcache_empty_kset; + +#define SUBTREE_WALK_RET_OK 0 +#define SUBTREE_WALK_RET_ERR 1 +#define SUBTREE_WALK_RET_NEED_KEY 2 +#define SUBTREE_WALK_RET_NEED_REQ 3 +#define SUBTREE_WALK_RET_RESEARCH 4 + +struct 
pcache_cache_subtree_walk_ctx { + struct pcache_cache_tree *cache_tree; + struct rb_node *start_node; + struct pcache_request *pcache_req; + struct pcache_cache_key *key; + u32 req_done; + int ret; + + /* pre-allocated key and backing_dev_req */ + struct pcache_cache_key *pre_alloc_key; + struct pcache_backing_dev_req *pre_alloc_req; + + struct list_head *delete_key_list; + struct list_head *submit_req_list; + + /* + * |--------| key_tmp + * |====| key + */ + int (*before)(struct pcache_cache_key *key, struct pcache_cache_key *key_tmp, + struct pcache_cache_subtree_walk_ctx *ctx); + + /* + * |----------| key_tmp + * |=====| key + */ + int (*after)(struct pcache_cache_key *key, struct pcache_cache_key *key_tmp, + struct pcache_cache_subtree_walk_ctx *ctx); + + /* + * |----------------| key_tmp + * |===========| key + */ + int (*overlap_tail)(struct pcache_cache_key *key, struct pcache_cache_key *key_tmp, + struct pcache_cache_subtree_walk_ctx *ctx); + + /* + * |--------| key_tmp + * |==========| key + */ + int (*overlap_head)(struct pcache_cache_key *key, struct pcache_cache_key *key_tmp, + struct pcache_cache_subtree_walk_ctx *ctx); + + /* + * |----| key_tmp + * |==========| key + */ + int (*overlap_contain)(struct pcache_cache_key *key, struct pcache_cache_key *key_tmp, + struct pcache_cache_subtree_walk_ctx *ctx); + + /* + * |-----------| key_tmp + * |====| key + */ + int (*overlap_contained)(struct pcache_cache_key *key, struct pcache_cache_key *key_tmp, + struct pcache_cache_subtree_walk_ctx *ctx); + + int (*walk_finally)(struct pcache_cache_subtree_walk_ctx *ctx, int ret); + bool (*walk_done)(struct pcache_cache_subtree_walk_ctx *ctx); +}; + +int cache_subtree_walk(struct pcache_cache_subtree_walk_ctx *ctx); +struct rb_node *cache_subtree_search(struct pcache_cache_subtree *cache_subtree, struct pcache_cache_key *key, + struct rb_node **parentp, struct rb_node ***newp, + struct list_head *delete_key_list); +int cache_kset_close(struct pcache_cache *cache, struct pcache_cache_kset *kset); +void clean_fn(struct work_struct *work); +void kset_flush_fn(struct work_struct *work); +int cache_replay(struct pcache_cache *cache); +int cache_tree_init(struct pcache_cache *cache, struct pcache_cache_tree *cache_tree, u32 n_subtrees); +void cache_tree_clear(struct pcache_cache_tree *cache_tree); +void cache_tree_exit(struct pcache_cache_tree *cache_tree); + +/* cache segments */ +struct pcache_cache_segment *get_cache_segment(struct pcache_cache *cache); +int cache_seg_init(struct pcache_cache *cache, u32 seg_id, u32 cache_seg_id, + bool new_cache); +void cache_seg_get(struct pcache_cache_segment *cache_seg); +void cache_seg_put(struct pcache_cache_segment *cache_seg); +void cache_seg_set_next_seg(struct pcache_cache_segment *cache_seg, u32 seg_id); + +/* cache request */ +int cache_flush(struct pcache_cache *cache); +void miss_read_end_work_fn(struct work_struct *work); +int pcache_cache_handle_req(struct pcache_cache *cache, struct pcache_request *pcache_req); + +/* gc */ +void pcache_cache_gc_fn(struct work_struct *work); + +/* writeback */ +void cache_writeback_exit(struct pcache_cache *cache); +int cache_writeback_init(struct pcache_cache *cache); +void cache_writeback_fn(struct work_struct *work); + +/* inline functions */ +static inline struct pcache_cache_subtree *get_subtree(struct pcache_cache_tree *cache_tree, u64 off) +{ + if (cache_tree->n_subtrees == 1) + return &cache_tree->subtrees[0]; + + return &cache_tree->subtrees[off >> PCACHE_CACHE_SUBTREE_SIZE_SHIFT]; +} + +static inline
void *cache_pos_addr(struct pcache_cache_pos *pos) +{ + return (pos->cache_seg->segment.data + pos->seg_off); +} + +static inline void *get_key_head_addr(struct pcache_cache *cache) +{ + return cache_pos_addr(&cache->key_head); +} + +static inline u32 get_kset_id(struct pcache_cache *cache, u64 off) +{ + u32 kset_id; + + div_u64_rem(off >> PCACHE_CACHE_SUBTREE_SIZE_SHIFT, cache->n_ksets, &kset_id); + + return kset_id; +} + +static inline struct pcache_cache_kset *get_kset(struct pcache_cache *cache, u32 kset_id) +{ + return (void *)cache->ksets + PCACHE_KSET_SIZE * kset_id; +} + +static inline struct pcache_cache_data_head *get_data_head(struct pcache_cache *cache) +{ + return this_cpu_ptr(cache->data_heads); +} + +static inline bool cache_key_empty(struct pcache_cache_key *key) +{ + return key->flags & PCACHE_CACHE_KEY_FLAGS_EMPTY; +} + +static inline bool cache_key_clean(struct pcache_cache_key *key) +{ + return key->flags & PCACHE_CACHE_KEY_FLAGS_CLEAN; +} + +static inline void cache_pos_copy(struct pcache_cache_pos *dst, struct pcache_cache_pos *src) +{ + memcpy(dst, src, sizeof(struct pcache_cache_pos)); +} + +/** + * cache_seg_is_ctrl_seg - Checks if a cache segment is a cache ctrl segment. + * @cache_seg_id: ID of the cache segment. + * + * Returns true if the cache segment ID corresponds to a cache ctrl segment. + * + * Note: We extend the segment control of the first cache segment + * (cache segment ID 0) to serve as the cache control (pcache_cache_ctrl) + * for the entire PCACHE cache. This function determines whether the given + * cache segment is the one storing the pcache_cache_ctrl information. + */ +static inline bool cache_seg_is_ctrl_seg(u32 cache_seg_id) +{ + return (cache_seg_id == 0); +} + +/** + * cache_key_cutfront - Cuts a specified length from the front of a cache key. + * @key: Pointer to pcache_cache_key structure. + * @cut_len: Length to cut from the front. + * + * Advances the cache key position by cut_len and adjusts offset and length accordingly. + */ +static inline void cache_key_cutfront(struct pcache_cache_key *key, u32 cut_len) +{ + if (key->cache_pos.cache_seg) + cache_pos_advance(&key->cache_pos, cut_len); + + key->off += cut_len; + key->len -= cut_len; +} + +/** + * cache_key_cutback - Cuts a specified length from the back of a cache key. + * @key: Pointer to pcache_cache_key structure. + * @cut_len: Length to cut from the back. + * + * Reduces the length of the cache key by cut_len. + */ +static inline void cache_key_cutback(struct pcache_cache_key *key, u32 cut_len) +{ + key->len -= cut_len; +} + +static inline void cache_key_delete(struct pcache_cache_key *key) +{ + struct pcache_cache_subtree *cache_subtree; + + cache_subtree = key->cache_subtree; + BUG_ON(!cache_subtree); + + rb_erase(&key->rb_node, &cache_subtree->root); + key->flags = 0; + cache_key_put(key); +} + +static inline bool cache_data_crc_on(struct pcache_cache *cache) +{ + return (cache->cache_info.flags & PCACHE_CACHE_FLAGS_DATA_CRC); +} + +static inline u32 cache_mode_get(struct pcache_cache *cache) +{ + return FIELD_GET(PCACHE_CACHE_FLAGS_CACHE_MODE_MASK, cache->cache_info.flags); +} + +static inline void cache_mode_set(struct pcache_cache *cache, u32 cache_mode) +{ + cache->cache_info.flags &= ~PCACHE_CACHE_FLAGS_CACHE_MODE_MASK; + cache->cache_info.flags |= FIELD_PREP(PCACHE_CACHE_FLAGS_CACHE_MODE_MASK, cache_mode); +} + +/** + * cache_key_data_crc - Calculates CRC for data in a cache key. + * @key: Pointer to the pcache_cache_key structure. 
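+ * + * The key's data lives in a DAX-mapped cache segment, so the checksum is computed in place over the address returned by cache_pos_addr().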
+ * + * Returns the CRC-32 checksum of the data within the cache key's position. + */ +static inline u32 cache_key_data_crc(struct pcache_cache_key *key) +{ + void *data; + + data = cache_pos_addr(&key->cache_pos); + + return crc32c(PCACHE_CRC_SEED, data, key->len); +} + +static inline u32 cache_kset_crc(struct pcache_cache_kset_onmedia *kset_onmedia) +{ + u32 crc_size; + + if (kset_onmedia->flags & PCACHE_KSET_FLAGS_LAST) + crc_size = sizeof(struct pcache_cache_kset_onmedia) - 4; + else + crc_size = struct_size(kset_onmedia, data, kset_onmedia->key_num) - 4; + + return crc32c(PCACHE_CRC_SEED, (void *)kset_onmedia + 4, crc_size); +} + +static inline u32 get_kset_onmedia_size(struct pcache_cache_kset_onmedia *kset_onmedia) +{ + return struct_size_t(struct pcache_cache_kset_onmedia, data, kset_onmedia->key_num); +} + +/** + * cache_seg_remain - Computes remaining space in a cache segment. + * @pos: Pointer to pcache_cache_pos structure. + * + * Returns the amount of remaining space in the segment data starting from + * the current position offset. + */ +static inline u32 cache_seg_remain(struct pcache_cache_pos *pos) +{ + struct pcache_cache_segment *cache_seg; + struct pcache_segment *segment; + u32 seg_remain; + + cache_seg = pos->cache_seg; + segment = &cache_seg->segment; + seg_remain = segment->data_size - pos->seg_off; + + return seg_remain; +} + +/** + * cache_key_invalid - Checks if a cache key is invalid. + * @key: Pointer to pcache_cache_key structure. + * + * Returns true if the cache key is invalid due to its generation being + * less than the generation of its segment; otherwise returns false. + * + * When the GC (garbage collection) thread identifies a segment + * as reclaimable, it increments the segment's generation (gen). However, + * it does not immediately remove all related cache keys. When accessing + * such a cache key, this function can be used to determine if the cache + * key has already become invalid. + */ +static inline bool cache_key_invalid(struct pcache_cache_key *key) +{ + if (cache_key_empty(key)) + return false; + + return (key->seg_gen < key->cache_pos.cache_seg->gen); +} + +/** + * cache_key_lstart - Retrieves the logical start offset of a cache key. + * @key: Pointer to pcache_cache_key structure. + * + * Returns the logical start offset for the cache key. + */ +static inline u64 cache_key_lstart(struct pcache_cache_key *key) +{ + return key->off; +} + +/** + * cache_key_lend - Retrieves the logical end offset of a cache key. + * @key: Pointer to pcache_cache_key structure. + * + * Returns the logical end offset for the cache key. + */ +static inline u64 cache_key_lend(struct pcache_cache_key *key) +{ + return key->off + key->len; +} + +static inline void cache_key_copy(struct pcache_cache_key *key_dst, struct pcache_cache_key *key_src) +{ + key_dst->off = key_src->off; + key_dst->len = key_src->len; + key_dst->seg_gen = key_src->seg_gen; + key_dst->cache_tree = key_src->cache_tree; + key_dst->cache_subtree = key_src->cache_subtree; + key_dst->flags = key_src->flags; + + cache_pos_copy(&key_dst->cache_pos, &key_src->cache_pos); +} + +/** + * cache_pos_onmedia_crc - Calculates the CRC for an on-media cache position. + * @pos_om: Pointer to pcache_cache_pos_onmedia structure. + * + * Calculates the CRC-32 checksum of the position, excluding the first 4 bytes. + * Returns the computed CRC value. 
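+ * + * Together with header.seq, this CRC lets pcache_meta_find_latest() select the newest intact copy among the PCACHE_META_INDEX_MAX replicas written by cache_pos_encode().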
+ */ +static inline u32 cache_pos_onmedia_crc(struct pcache_cache_pos_onmedia *pos_om) +{ + return pcache_meta_crc(&pos_om->header, sizeof(struct pcache_cache_pos_onmedia)); +} + +void cache_pos_encode(struct pcache_cache *cache, + struct pcache_cache_pos_onmedia *pos_onmedia, + struct pcache_cache_pos *pos, u64 seq, u32 *index); +int cache_pos_decode(struct pcache_cache *cache, + struct pcache_cache_pos_onmedia *pos_onmedia, + struct pcache_cache_pos *pos, u64 *seq, u32 *index); + +static inline void cache_encode_key_tail(struct pcache_cache *cache) +{ + cache_pos_encode(cache, cache->cache_ctrl->key_tail_pos, + &cache->key_tail, ++cache->key_tail_seq, + &cache->key_tail_index); +} + +static inline int cache_decode_key_tail(struct pcache_cache *cache) +{ + return cache_pos_decode(cache, cache->cache_ctrl->key_tail_pos, + &cache->key_tail, &cache->key_tail_seq, + &cache->key_tail_index); +} + +static inline void cache_encode_dirty_tail(struct pcache_cache *cache) +{ + cache_pos_encode(cache, cache->cache_ctrl->dirty_tail_pos, + &cache->dirty_tail, ++cache->dirty_tail_seq, + &cache->dirty_tail_index); +} + +static inline int cache_decode_dirty_tail(struct pcache_cache *cache) +{ + return cache_pos_decode(cache, cache->cache_ctrl->dirty_tail_pos, + &cache->dirty_tail, &cache->dirty_tail_seq, + &cache->dirty_tail_index); +} + +int pcache_cache_init(void); +void pcache_cache_exit(void); +#endif /* _PCACHE_CACHE_H */ diff --git a/drivers/md/dm-pcache/cache_dev.c b/drivers/md/dm-pcache/cache_dev.c new file mode 100644 index 000000000000..ece689e6ce59 --- /dev/null +++ b/drivers/md/dm-pcache/cache_dev.c @@ -0,0 +1,303 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <linux/blkdev.h> +#include <linux/dax.h> +#include <linux/vmalloc.h> +#include <linux/parser.h> + +#include "cache_dev.h" +#include "backing_dev.h" +#include "cache.h" +#include "dm_pcache.h" + +static void cache_dev_dax_exit(struct pcache_cache_dev *cache_dev) +{ + if (cache_dev->use_vmap) + vunmap(cache_dev->mapping); +} + +static int build_vmap(struct dax_device *dax_dev, long total_pages, void **vaddr) +{ + struct page **pages; + long i = 0, chunk; + unsigned long pfn; + int ret; + + pages = vmalloc_array(total_pages, sizeof(struct page *)); + if (!pages) + return -ENOMEM; + + do { + chunk = dax_direct_access(dax_dev, i, total_pages - i, + DAX_ACCESS, NULL, &pfn); + if (chunk <= 0) { + ret = chunk ? 
chunk : -EINVAL; + goto out_free; + } + + if (!pfn_valid(pfn)) { + ret = -EOPNOTSUPP; + goto out_free; + } + + while (chunk-- && i < total_pages) { + pages[i++] = pfn_to_page(pfn); + pfn++; + if (!(i & 15)) + cond_resched(); + } + } while (i < total_pages); + + *vaddr = vmap(pages, total_pages, VM_MAP, PAGE_KERNEL); + if (!*vaddr) { + ret = -ENOMEM; + goto out_free; + } + + ret = 0; + +out_free: + vfree(pages); + return ret; +} + +static int cache_dev_dax_init(struct pcache_cache_dev *cache_dev) +{ + struct dm_pcache *pcache = CACHE_DEV_TO_PCACHE(cache_dev); + struct dax_device *dax_dev; + long total_pages, mapped_pages; + u64 bdev_size; + void *vaddr; + int ret; + int id; + unsigned long pfn; + + dax_dev = cache_dev->dm_dev->dax_dev; + /* total size check */ + bdev_size = bdev_nr_bytes(cache_dev->dm_dev->bdev); + if (bdev_size < PCACHE_CACHE_DEV_SIZE_MIN) { + pcache_dev_err(pcache, "dax device is too small, requires at least %llu", + PCACHE_CACHE_DEV_SIZE_MIN); + ret = -ENOSPC; + goto out; + } + + total_pages = bdev_size >> PAGE_SHIFT; + /* attempt: direct-map the whole range */ + id = dax_read_lock(); + mapped_pages = dax_direct_access(dax_dev, 0, total_pages, + DAX_ACCESS, &vaddr, &pfn); + if (mapped_pages < 0) { + pcache_dev_err(pcache, "dax_direct_access failed: %ld\n", mapped_pages); + ret = mapped_pages; + goto unlock; + } + + if (!pfn_valid(pfn)) { + ret = -EOPNOTSUPP; + goto unlock; + } + + if (mapped_pages == total_pages) { + /* success: contiguous direct mapping */ + cache_dev->mapping = vaddr; + } else { + /* need vmap fallback */ + ret = build_vmap(dax_dev, total_pages, &vaddr); + if (ret) { + pcache_dev_err(pcache, "vmap fallback failed: %d\n", ret); + goto unlock; + } + + cache_dev->mapping = vaddr; + cache_dev->use_vmap = true; + } + dax_read_unlock(id); + + return 0; +unlock: + dax_read_unlock(id); +out: + return ret; +} + +void cache_dev_zero_range(struct pcache_cache_dev *cache_dev, void *pos, u32 size) +{ + memset(pos, 0, size); + dax_flush(cache_dev->dm_dev->dax_dev, pos, size); +} + +static int sb_read(struct pcache_cache_dev *cache_dev, struct pcache_sb *sb) +{ + struct pcache_sb *sb_addr = CACHE_DEV_SB(cache_dev); + + if (copy_mc_to_kernel(sb, sb_addr, sizeof(struct pcache_sb))) + return -EIO; + + return 0; +} + +static void sb_write(struct pcache_cache_dev *cache_dev, struct pcache_sb *sb) +{ + struct pcache_sb *sb_addr = CACHE_DEV_SB(cache_dev); + + memcpy_flushcache(sb_addr, sb, sizeof(struct pcache_sb)); + pmem_wmb(); +} + +static int sb_init(struct pcache_cache_dev *cache_dev, struct pcache_sb *sb) +{ + struct dm_pcache *pcache = CACHE_DEV_TO_PCACHE(cache_dev); + u64 nr_segs; + u64 cache_dev_size; + u64 magic; + u32 flags = 0; + + magic = le64_to_cpu(sb->magic); + if (magic) + return -EEXIST; + + cache_dev_size = bdev_nr_bytes(file_bdev(cache_dev->dm_dev->bdev_file)); + if (cache_dev_size < PCACHE_CACHE_DEV_SIZE_MIN) { + pcache_dev_err(pcache, "dax device is too small, requires at least %llu", + PCACHE_CACHE_DEV_SIZE_MIN); + return -ENOSPC; + } + + nr_segs = (cache_dev_size - PCACHE_SEGMENTS_OFF) / PCACHE_SEG_SIZE; + +#if defined(__BYTE_ORDER) ?
(__BIG_ENDIAN == __BYTE_ORDER) : defined(__BIG_ENDIAN) + flags |= PCACHE_SB_F_BIGENDIAN; +#endif + sb->flags = cpu_to_le32(flags); + sb->magic = cpu_to_le64(PCACHE_MAGIC); + sb->seg_num = cpu_to_le32(nr_segs); + sb->crc = cpu_to_le32(crc32c(PCACHE_CRC_SEED, (void *)(sb) + 4, sizeof(struct pcache_sb) - 4)); + + cache_dev_zero_range(cache_dev, CACHE_DEV_CACHE_INFO(cache_dev), + PCACHE_CACHE_INFO_SIZE * PCACHE_META_INDEX_MAX + + PCACHE_CACHE_CTRL_SIZE); + + return 0; +} + +static int sb_validate(struct pcache_cache_dev *cache_dev, struct pcache_sb *sb) +{ + struct dm_pcache *pcache = CACHE_DEV_TO_PCACHE(cache_dev); + u32 flags; + u32 crc; + + if (le64_to_cpu(sb->magic) != PCACHE_MAGIC) { + pcache_dev_err(pcache, "unexpected magic: %llx\n", + le64_to_cpu(sb->magic)); + return -EINVAL; + } + + crc = crc32c(PCACHE_CRC_SEED, (void *)(sb) + 4, sizeof(struct pcache_sb) - 4); + if (crc != le32_to_cpu(sb->crc)) { + pcache_dev_err(pcache, "corrupted sb: %u, expected: %u\n", crc, le32_to_cpu(sb->crc)); + return -EINVAL; + } + + flags = le32_to_cpu(sb->flags); +#if defined(__BYTE_ORDER) ? (__BIG_ENDIAN == __BYTE_ORDER) : defined(__BIG_ENDIAN) + if (!(flags & PCACHE_SB_F_BIGENDIAN)) { + pcache_dev_err(pcache, "cache_dev is not big endian\n"); + return -EINVAL; + } +#else + if (flags & PCACHE_SB_F_BIGENDIAN) { + pcache_dev_err(pcache, "cache_dev is big endian\n"); + return -EINVAL; + } +#endif + return 0; +} + +static int cache_dev_init(struct pcache_cache_dev *cache_dev, u32 seg_num) +{ + cache_dev->seg_num = seg_num; + cache_dev->seg_bitmap = kvcalloc(BITS_TO_LONGS(cache_dev->seg_num), sizeof(unsigned long), GFP_KERNEL); + if (!cache_dev->seg_bitmap) + return -ENOMEM; + + return 0; +} + +static void cache_dev_exit(struct pcache_cache_dev *cache_dev) +{ + kvfree(cache_dev->seg_bitmap); +} + +void cache_dev_stop(struct dm_pcache *pcache) +{ + struct pcache_cache_dev *cache_dev = &pcache->cache_dev; + + cache_dev_exit(cache_dev); + cache_dev_dax_exit(cache_dev); +} + +int cache_dev_start(struct dm_pcache *pcache) +{ + struct pcache_cache_dev *cache_dev = &pcache->cache_dev; + struct pcache_sb sb; + bool format = false; + int ret; + + mutex_init(&cache_dev->seg_lock); + + ret = cache_dev_dax_init(cache_dev); + if (ret) { + pcache_dev_err(pcache, "failed to init cache_dev %s via DAX: %d.", + cache_dev->dm_dev->name, ret); + goto err; + } + + ret = sb_read(cache_dev, &sb); + if (ret) + goto dax_release; + + if (le64_to_cpu(sb.magic) == 0) { + format = true; + ret = sb_init(cache_dev, &sb); + if (ret < 0) + goto dax_release; + } + + ret = sb_validate(cache_dev, &sb); + if (ret) + goto dax_release; + + cache_dev->sb_flags = le32_to_cpu(sb.flags); + ret = cache_dev_init(cache_dev, le32_to_cpu(sb.seg_num)); + if (ret) + goto dax_release; + + if (format) + sb_write(cache_dev, &sb); + + return 0; + +dax_release: + cache_dev_dax_exit(cache_dev); +err: + return ret; +} + +int cache_dev_get_empty_segment_id(struct pcache_cache_dev *cache_dev, u32 *seg_id) +{ + int ret; + + mutex_lock(&cache_dev->seg_lock); + *seg_id = find_next_zero_bit(cache_dev->seg_bitmap, cache_dev->seg_num, 0); + if (*seg_id == cache_dev->seg_num) { + ret = -ENOSPC; + goto unlock; + } + + __set_bit(*seg_id, cache_dev->seg_bitmap); + ret = 0; +unlock: + mutex_unlock(&cache_dev->seg_lock); + return ret; +} diff --git a/drivers/md/dm-pcache/cache_dev.h b/drivers/md/dm-pcache/cache_dev.h new file mode 100644 index 000000000000..6251eb4ebe96 --- /dev/null +++ b/drivers/md/dm-pcache/cache_dev.h @@ -0,0 +1,70 @@ +/* SPDX-License-Identifier:
GPL-2.0-or-later */ +#ifndef _PCACHE_CACHE_DEV_H +#define _PCACHE_CACHE_DEV_H + +#include <linux/device.h> +#include <linux/device-mapper.h> + +#include "pcache_internal.h" + +#define PCACHE_MAGIC 0x65B05EFA96C596EFULL + +#define PCACHE_SB_OFF (4 * PCACHE_KB) +#define PCACHE_SB_SIZE (4 * PCACHE_KB) + +#define PCACHE_CACHE_INFO_OFF (PCACHE_SB_OFF + PCACHE_SB_SIZE) +#define PCACHE_CACHE_INFO_SIZE (4 * PCACHE_KB) + +#define PCACHE_CACHE_CTRL_OFF (PCACHE_CACHE_INFO_OFF + (PCACHE_CACHE_INFO_SIZE * PCACHE_META_INDEX_MAX)) +#define PCACHE_CACHE_CTRL_SIZE (4 * PCACHE_KB) + +#define PCACHE_SEGMENTS_OFF (PCACHE_CACHE_CTRL_OFF + PCACHE_CACHE_CTRL_SIZE) +#define PCACHE_SEG_INFO_SIZE (4 * PCACHE_KB) + +#define PCACHE_CACHE_DEV_SIZE_MIN (512 * PCACHE_MB) /* 512 MB */ +#define PCACHE_SEG_SIZE (16 * PCACHE_MB) /* Size of each PCACHE segment (16 MB) */ + +#define CACHE_DEV_SB(cache_dev) ((struct pcache_sb *)(cache_dev->mapping + PCACHE_SB_OFF)) +#define CACHE_DEV_CACHE_INFO(cache_dev) ((void *)cache_dev->mapping + PCACHE_CACHE_INFO_OFF) +#define CACHE_DEV_CACHE_CTRL(cache_dev) ((void *)cache_dev->mapping + PCACHE_CACHE_CTRL_OFF) +#define CACHE_DEV_SEGMENTS(cache_dev) ((void *)cache_dev->mapping + PCACHE_SEGMENTS_OFF) +#define CACHE_DEV_SEGMENT(cache_dev, id) ((void *)CACHE_DEV_SEGMENTS(cache_dev) + (u64)id * PCACHE_SEG_SIZE) + +/* + * PCACHE SB flags configured during formatting + * + * The PCACHE_SB_F_xxx flags define registration requirements based on cache_dev + * formatting. For a machine to register a cache_dev: + * - PCACHE_SB_F_BIGENDIAN: Requires a big-endian machine. + */ +#define PCACHE_SB_F_BIGENDIAN BIT(0) + +struct pcache_sb { + __le32 crc; + __le32 flags; + __le64 magic; + + __le32 seg_num; +}; + +struct pcache_cache_dev { + u32 sb_flags; + u32 seg_num; + void *mapping; + bool use_vmap; + + struct dm_dev *dm_dev; + + struct mutex seg_lock; + unsigned long *seg_bitmap; +}; + +struct dm_pcache; +int cache_dev_start(struct dm_pcache *pcache); +void cache_dev_stop(struct dm_pcache *pcache); + +void cache_dev_zero_range(struct pcache_cache_dev *cache_dev, void *pos, u32 size); + +int cache_dev_get_empty_segment_id(struct pcache_cache_dev *cache_dev, u32 *seg_id); + +#endif /* _PCACHE_CACHE_DEV_H */ diff --git a/drivers/md/dm-pcache/cache_gc.c b/drivers/md/dm-pcache/cache_gc.c new file mode 100644 index 000000000000..94f8b276a021 --- /dev/null +++ b/drivers/md/dm-pcache/cache_gc.c @@ -0,0 +1,170 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include "cache.h" +#include "backing_dev.h" +#include "cache_dev.h" +#include "dm_pcache.h" + +/** + * cache_key_gc - Releases the reference of a cache key segment. + * @cache: Pointer to the pcache_cache structure. + * @key: Pointer to the cache key to be garbage collected. + * + * This function decrements the reference count of the cache segment + * associated with the given key. If the reference count drops to zero, + * the segment may be invalidated and reused. 
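+ * Reused segments get a new generation, so any stale key that still references the old generation is later rejected by cache_key_invalid().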
+ */ +static void cache_key_gc(struct pcache_cache *cache, struct pcache_cache_key *key) +{ + cache_seg_put(key->cache_pos.cache_seg); +} + +static bool need_gc(struct pcache_cache *cache, struct pcache_cache_pos *dirty_tail, struct pcache_cache_pos *key_tail) +{ + struct dm_pcache *pcache = CACHE_TO_PCACHE(cache); + struct pcache_cache_kset_onmedia *kset_onmedia; + void *dirty_addr, *key_addr; + u32 segs_used, segs_gc_threshold, to_copy; + int ret; + + dirty_addr = cache_pos_addr(dirty_tail); + key_addr = cache_pos_addr(key_tail); + if (dirty_addr == key_addr) { + pcache_dev_debug(pcache, "key tail is equal to dirty tail: %u:%u\n", + dirty_tail->cache_seg->cache_seg_id, + dirty_tail->seg_off); + return false; + } + + kset_onmedia = (struct pcache_cache_kset_onmedia *)cache->gc_kset_onmedia_buf; + + to_copy = min(PCACHE_KSET_ONMEDIA_SIZE_MAX, PCACHE_SEG_SIZE - key_tail->seg_off); + ret = copy_mc_to_kernel(kset_onmedia, key_addr, to_copy); + if (ret) { + pcache_dev_err(pcache, "failed to read kset: %d", ret); + return false; + } + + /* Check if kset_onmedia is corrupted */ + if (kset_onmedia->magic != PCACHE_KSET_MAGIC) { + pcache_dev_debug(pcache, "gc error: magic is not as expected. key_tail: %u:%u magic: %llx, expected: %llx\n", + key_tail->cache_seg->cache_seg_id, key_tail->seg_off, + kset_onmedia->magic, PCACHE_KSET_MAGIC); + return false; + } + + /* Verify the CRC of the kset_onmedia */ + if (kset_onmedia->crc != cache_kset_crc(kset_onmedia)) { + pcache_dev_debug(pcache, "gc error: crc is not as expected. crc: %x, expected: %x\n", + cache_kset_crc(kset_onmedia), kset_onmedia->crc); + return false; + } + + segs_used = bitmap_weight(cache->seg_map, cache->n_segs); + segs_gc_threshold = cache->n_segs * pcache_cache_get_gc_percent(cache) / 100; + if (segs_used < segs_gc_threshold) { + pcache_dev_debug(pcache, "segs_used: %u, segs_gc_threshold: %u\n", segs_used, segs_gc_threshold); + return false; + } + + return true; +} + +/** + * last_kset_gc - Advances the garbage collection for the last kset. + * @cache: Pointer to the pcache_cache structure. + * @kset_onmedia: Pointer to the kset_onmedia structure for the last kset.
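+ * + * A last kset carries no keys; its next_cache_seg_id links to the segment holding the next kset, so GC moves key_tail there and clears the current segment from the seg_map.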
+ */ +static void last_kset_gc(struct pcache_cache *cache, struct pcache_cache_kset_onmedia *kset_onmedia) +{ + struct dm_pcache *pcache = CACHE_TO_PCACHE(cache); + struct pcache_cache_segment *cur_seg, *next_seg; + + cur_seg = cache->key_tail.cache_seg; + + next_seg = &cache->segments[kset_onmedia->next_cache_seg_id]; + + mutex_lock(&cache->key_tail_lock); + cache->key_tail.cache_seg = next_seg; + cache->key_tail.seg_off = 0; + cache_encode_key_tail(cache); + mutex_unlock(&cache->key_tail_lock); + + pcache_dev_debug(pcache, "gc advance kset seg: %u\n", cur_seg->cache_seg_id); + + spin_lock(&cache->seg_map_lock); + __clear_bit(cur_seg->cache_seg_id, cache->seg_map); + spin_unlock(&cache->seg_map_lock); +} + +void pcache_cache_gc_fn(struct work_struct *work) +{ + struct pcache_cache *cache = container_of(work, struct pcache_cache, gc_work.work); + struct dm_pcache *pcache = CACHE_TO_PCACHE(cache); + struct pcache_cache_pos dirty_tail, key_tail; + struct pcache_cache_kset_onmedia *kset_onmedia; + struct pcache_cache_key_onmedia *key_onmedia; + struct pcache_cache_key *key; + int ret; + int i; + + kset_onmedia = (struct pcache_cache_kset_onmedia *)cache->gc_kset_onmedia_buf; + + while (true) { + if (pcache_is_stopping(pcache) || atomic_read(&cache->gc_errors)) + return; + + /* Get new tail positions */ + mutex_lock(&cache->dirty_tail_lock); + cache_pos_copy(&dirty_tail, &cache->dirty_tail); + mutex_unlock(&cache->dirty_tail_lock); + + mutex_lock(&cache->key_tail_lock); + cache_pos_copy(&key_tail, &cache->key_tail); + mutex_unlock(&cache->key_tail_lock); + + if (!need_gc(cache, &dirty_tail, &key_tail)) + break; + + if (kset_onmedia->flags & PCACHE_KSET_FLAGS_LAST) { + /* Don't move to the next segment if dirty_tail has not moved */ + if (dirty_tail.cache_seg == key_tail.cache_seg) + break; + + last_kset_gc(cache, kset_onmedia); + continue; + } + + for (i = 0; i < kset_onmedia->key_num; i++) { + struct pcache_cache_key key_tmp = { 0 }; + + key_onmedia = &kset_onmedia->data[i]; + + key = &key_tmp; + cache_key_init(&cache->req_key_tree, key); + + ret = cache_key_decode(cache, key_onmedia, key); + if (ret) { + /* Return without re-arming the gc work, and prevent any + * future gc, because we cannot retry a partially-gc-ed kset. + */ + atomic_inc(&cache->gc_errors); + pcache_dev_err(pcache, "failed to decode cache key in gc\n"); + return; + } + + cache_key_gc(cache, key); + } + + pcache_dev_debug(pcache, "gc advance: %u:%u %u\n", + key_tail.cache_seg->cache_seg_id, + key_tail.seg_off, + get_kset_onmedia_size(kset_onmedia)); + + mutex_lock(&cache->key_tail_lock); + cache_pos_advance(&cache->key_tail, get_kset_onmedia_size(kset_onmedia)); + cache_encode_key_tail(cache); + mutex_unlock(&cache->key_tail_lock); + } + + queue_delayed_work(cache_get_wq(cache), &cache->gc_work, PCACHE_CACHE_GC_INTERVAL); +} diff --git a/drivers/md/dm-pcache/cache_key.c b/drivers/md/dm-pcache/cache_key.c new file mode 100644 index 000000000000..2b77e121f89b --- /dev/null +++ b/drivers/md/dm-pcache/cache_key.c @@ -0,0 +1,888 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include "cache.h" +#include "backing_dev.h" +#include "cache_dev.h" +#include "dm_pcache.h" + +struct pcache_cache_kset_onmedia pcache_empty_kset = { 0 }; + +void cache_key_init(struct pcache_cache_tree *cache_tree, struct pcache_cache_key *key) +{ + kref_init(&key->ref); + key->cache_tree = cache_tree; + INIT_LIST_HEAD(&key->list_node); + RB_CLEAR_NODE(&key->rb_node); +} + +struct pcache_cache_key *cache_key_alloc(struct pcache_cache_tree *cache_tree, gfp_t
gfp_mask) +{ + struct pcache_cache_key *key; + + key = mempool_alloc(&cache_tree->key_pool, gfp_mask); + if (!key) + return NULL; + + memset(key, 0, sizeof(struct pcache_cache_key)); + cache_key_init(cache_tree, key); + + return key; +} + +/** + * cache_key_get - Increment the reference count of a cache key. + * @key: Pointer to the pcache_cache_key structure. + * + * This function increments the reference count of the specified cache key, + * ensuring that it is not freed while still in use. + */ +void cache_key_get(struct pcache_cache_key *key) +{ + kref_get(&key->ref); +} + +/** + * cache_key_destroy - Free a cache key structure when its reference count drops to zero. + * @ref: Pointer to the kref structure. + * + * This function is called when the reference count of the cache key reaches zero. + * It releases the cache key back into the cache tree's key mempool. + */ +static void cache_key_destroy(struct kref *ref) +{ + struct pcache_cache_key *key = container_of(ref, struct pcache_cache_key, ref); + struct pcache_cache_tree *cache_tree = key->cache_tree; + + mempool_free(key, &cache_tree->key_pool); +} + +void cache_key_put(struct pcache_cache_key *key) +{ + kref_put(&key->ref, cache_key_destroy); +} + +void cache_pos_advance(struct pcache_cache_pos *pos, u32 len) +{ + /* Ensure enough space remains in the current segment */ + BUG_ON(cache_seg_remain(pos) < len); + + pos->seg_off += len; +} + +static void cache_key_encode(struct pcache_cache *cache, + struct pcache_cache_key_onmedia *key_onmedia, + struct pcache_cache_key *key) +{ + key_onmedia->off = key->off; + key_onmedia->len = key->len; + + key_onmedia->cache_seg_id = key->cache_pos.cache_seg->cache_seg_id; + key_onmedia->cache_seg_off = key->cache_pos.seg_off; + + key_onmedia->seg_gen = key->seg_gen; + key_onmedia->flags = key->flags; + + if (cache_data_crc_on(cache)) + key_onmedia->data_crc = cache_key_data_crc(key); +} + +int cache_key_decode(struct pcache_cache *cache, + struct pcache_cache_key_onmedia *key_onmedia, + struct pcache_cache_key *key) +{ + struct dm_pcache *pcache = CACHE_TO_PCACHE(cache); + + key->off = key_onmedia->off; + key->len = key_onmedia->len; + + key->cache_pos.cache_seg = &cache->segments[key_onmedia->cache_seg_id]; + key->cache_pos.seg_off = key_onmedia->cache_seg_off; + + key->seg_gen = key_onmedia->seg_gen; + key->flags = key_onmedia->flags; + + if (cache_data_crc_on(cache) && + key_onmedia->data_crc != cache_key_data_crc(key)) { + pcache_dev_err(pcache, "key: %llu:%u seg %u:%u data_crc error: %x, expected: %x\n", + key->off, key->len, key->cache_pos.cache_seg->cache_seg_id, + key->cache_pos.seg_off, cache_key_data_crc(key), key_onmedia->data_crc); + return -EIO; + } + + return 0; +} + +static void append_last_kset(struct pcache_cache *cache, u32 next_seg) +{ + struct pcache_cache_kset_onmedia kset_onmedia = { 0 }; + + kset_onmedia.flags |= PCACHE_KSET_FLAGS_LAST; + kset_onmedia.next_cache_seg_id = next_seg; + kset_onmedia.magic = PCACHE_KSET_MAGIC; + kset_onmedia.crc = cache_kset_crc(&kset_onmedia); + + memcpy_flushcache(get_key_head_addr(cache), &kset_onmedia, sizeof(struct pcache_cache_kset_onmedia)); + pmem_wmb(); + cache_pos_advance(&cache->key_head, sizeof(struct pcache_cache_kset_onmedia)); +} + +int cache_kset_close(struct pcache_cache *cache, struct pcache_cache_kset *kset) +{ + struct pcache_cache_kset_onmedia *kset_onmedia; + u32 kset_onmedia_size; + int ret; + + kset_onmedia = &kset->kset_onmedia; + + if (!kset_onmedia->key_num) + return 0; + + kset_onmedia_size = struct_size(kset_onmedia, data,
kset_onmedia->key_num); + + spin_lock(&cache->key_head_lock); +again: + /* Reserve space for the last kset */ + if (cache_seg_remain(&cache->key_head) < kset_onmedia_size + sizeof(struct pcache_cache_kset_onmedia)) { + struct pcache_cache_segment *next_seg; + + next_seg = get_cache_segment(cache); + if (!next_seg) { + ret = -EBUSY; + goto out; + } + + /* clear outdated kset in next seg */ + memcpy_flushcache(next_seg->segment.data, &pcache_empty_kset, + sizeof(struct pcache_cache_kset_onmedia)); + append_last_kset(cache, next_seg->cache_seg_id); + cache->key_head.cache_seg = next_seg; + cache->key_head.seg_off = 0; + goto again; + } + + kset_onmedia->magic = PCACHE_KSET_MAGIC; + kset_onmedia->crc = cache_kset_crc(kset_onmedia); + + /* clear outdated kset after current kset */ + memcpy_flushcache(get_key_head_addr(cache) + kset_onmedia_size, &pcache_empty_kset, + sizeof(struct pcache_cache_kset_onmedia)); + /* write current kset into segment */ + memcpy_flushcache(get_key_head_addr(cache), kset_onmedia, kset_onmedia_size); + pmem_wmb(); + + /* reset kset_onmedia */ + memset(kset_onmedia, 0, sizeof(struct pcache_cache_kset_onmedia)); + cache_pos_advance(&cache->key_head, kset_onmedia_size); + + ret = 0; +out: + spin_unlock(&cache->key_head_lock); + + return ret; +} + +/** + * cache_key_append - Append a cache key to the related kset. + * @cache: Pointer to the pcache_cache structure. + * @key: Pointer to the cache key structure to append. + * @force_close: if true, close the current kset after appending. + * + * This function appends a cache key to the appropriate kset. If the kset + * is full, it is closed immediately; otherwise, delayed flush work is + * queued to write the kset to media. + * + * Returns 0 on success, or a negative error code on failure. + */ +int cache_key_append(struct pcache_cache *cache, struct pcache_cache_key *key, bool force_close) +{ + struct pcache_cache_kset *kset; + struct pcache_cache_kset_onmedia *kset_onmedia; + struct pcache_cache_key_onmedia *key_onmedia; + u32 kset_id = get_kset_id(cache, key->off); + int ret = 0; + + kset = get_kset(cache, kset_id); + kset_onmedia = &kset->kset_onmedia; + + spin_lock(&kset->kset_lock); + key_onmedia = &kset_onmedia->data[kset_onmedia->key_num]; + cache_key_encode(cache, key_onmedia, key); + + /* Check if the current kset has reached the maximum number of keys */ + if (++kset_onmedia->key_num == PCACHE_KSET_KEYS_MAX || force_close) { + /* If full, close the kset */ + ret = cache_kset_close(cache, kset); + if (ret) { + kset_onmedia->key_num--; + goto out; + } + } else { + /* If not full, queue delayed work to flush the kset */ + queue_delayed_work(cache_get_wq(cache), &kset->flush_work, 1 * HZ); + } +out: + spin_unlock(&kset->kset_lock); + + return ret; +} + +/** + * cache_subtree_walk - Traverse the cache tree. + * @ctx: Pointer to the context structure for traversal. + * + * This function traverses the cache tree starting from the specified node. + * It calls the appropriate callback functions based on the relationships + * between the keys in the cache tree. + * + * Returns SUBTREE_WALK_RET_OK on success, or another SUBTREE_WALK_RET_* code + * describing why the walk stopped. + */ +int cache_subtree_walk(struct pcache_cache_subtree_walk_ctx *ctx) +{ + struct pcache_cache_key *key_tmp, *key; + struct rb_node *node_tmp; + int ret = SUBTREE_WALK_RET_OK; + + key = ctx->key; + node_tmp = ctx->start_node; + + while (node_tmp) { + if (ctx->walk_done && ctx->walk_done(ctx)) + break; + + key_tmp = CACHE_KEY(node_tmp); + /* + * If key_tmp ends before the start of key, continue to the next node.
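+ * For example, if key covers [100, 200) and key_tmp covers [20, 80), key_tmp ends before key starts, so the optional ->after() callback runs and the walk advances: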
+ * |----------| + * |=====| + */ + if (cache_key_lend(key_tmp) <= cache_key_lstart(key)) { + if (ctx->after) { + ret = ctx->after(key, key_tmp, ctx); + if (ret) + goto out; + } + goto next; + } + + /* + * If key_tmp starts after the end of key, stop traversing. + * |--------| + * |====| + */ + if (cache_key_lstart(key_tmp) >= cache_key_lend(key)) { + if (ctx->before) { + ret = ctx->before(key, key_tmp, ctx); + if (ret) + goto out; + } + break; + } + + /* Handle overlapping keys */ + if (cache_key_lstart(key_tmp) >= cache_key_lstart(key)) { + /* + * If key_tmp encompasses key. + * |----------------| key_tmp + * |===========| key + */ + if (cache_key_lend(key_tmp) >= cache_key_lend(key)) { + if (ctx->overlap_tail) { + ret = ctx->overlap_tail(key, key_tmp, ctx); + if (ret) + goto out; + } + break; + } + + /* + * If key_tmp is contained within key. + * |----| key_tmp + * |==========| key + */ + if (ctx->overlap_contain) { + ret = ctx->overlap_contain(key, key_tmp, ctx); + if (ret) + goto out; + } + + goto next; + } + + /* + * If key_tmp starts before key ends but ends after key. + * |-----------| key_tmp + * |====| key + */ + if (cache_key_lend(key_tmp) > cache_key_lend(key)) { + if (ctx->overlap_contained) { + ret = ctx->overlap_contained(key, key_tmp, ctx); + if (ret) + goto out; + } + break; + } + + /* + * If key_tmp starts before key and ends within key. + * |--------| key_tmp + * |==========| key + */ + if (ctx->overlap_head) { + ret = ctx->overlap_head(key, key_tmp, ctx); + if (ret) + goto out; + } +next: + node_tmp = rb_next(node_tmp); + } + +out: + if (ctx->walk_finally) + ret = ctx->walk_finally(ctx, ret); + + return ret; +} + +/** + * cache_subtree_search - Search for a key in the cache tree. + * @cache_subtree: Pointer to the cache tree structure. + * @key: Pointer to the cache key to search for. + * @parentp: Pointer to store the parent node of the found node. + * @newp: Pointer to store the location where the new node should be inserted. + * @delete_key_list: List to collect invalid keys for deletion. + * + * This function searches the cache tree for a specific key and returns + * the node that is the predecessor of the key, or first node if the key is + * less than all keys in the tree. If any invalid keys are found during + * the search, they are added to the delete_key_list for later cleanup. + * + * Returns a pointer to the previous node. 
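+ *
+ * Illustrative example (hypothetical offsets, not taken from a real
+ * tree): for a subtree holding keys at offsets 0, 8 and 16, searching
+ * for a key with ->off == 10 returns the node at offset 8 — the last
+ * node passed on a right turn during the descent, i.e. the predecessor
+ * by offset — while searching for ->off == 0 takes no right turn, so
+ * rb_first() of the subtree is returned instead.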
+ */ +struct rb_node *cache_subtree_search(struct pcache_cache_subtree *cache_subtree, struct pcache_cache_key *key, + struct rb_node **parentp, struct rb_node ***newp, + struct list_head *delete_key_list) +{ + struct rb_node **new, *parent = NULL; + struct pcache_cache_key *key_tmp; + struct rb_node *prev_node = NULL; + + new = &(cache_subtree->root.rb_node); + while (*new) { + key_tmp = container_of(*new, struct pcache_cache_key, rb_node); + if (cache_key_invalid(key_tmp)) + list_add(&key_tmp->list_node, delete_key_list); + + parent = *new; + if (key_tmp->off >= key->off) { + new = &((*new)->rb_left); + } else { + prev_node = *new; + new = &((*new)->rb_right); + } + } + + if (!prev_node) + prev_node = rb_first(&cache_subtree->root); + + if (parentp) + *parentp = parent; + + if (newp) + *newp = new; + + return prev_node; +} + +static struct pcache_cache_key *get_pre_alloc_key(struct pcache_cache_subtree_walk_ctx *ctx) +{ + struct pcache_cache_key *key; + + if (ctx->pre_alloc_key) { + key = ctx->pre_alloc_key; + ctx->pre_alloc_key = NULL; + + return key; + } + + return cache_key_alloc(ctx->cache_tree, GFP_NOWAIT); +} + +/** + * fixup_overlap_tail - Adjust the key when it overlaps at the tail. + * @key: Pointer to the new cache key being inserted. + * @key_tmp: Pointer to the existing key that overlaps. + * @ctx: Pointer to the context for walking the cache tree. + * + * This function modifies the existing key (key_tmp) when there is an + * overlap at the tail with the new key. If the modified key becomes + * empty, it is deleted. + */ +static int fixup_overlap_tail(struct pcache_cache_key *key, + struct pcache_cache_key *key_tmp, + struct pcache_cache_subtree_walk_ctx *ctx) +{ + /* + * |----------------| key_tmp + * |===========| key + */ + BUG_ON(cache_key_empty(key)); + if (cache_key_empty(key_tmp)) { + cache_key_delete(key_tmp); + return SUBTREE_WALK_RET_RESEARCH; + } + + cache_key_cutfront(key_tmp, cache_key_lend(key) - cache_key_lstart(key_tmp)); + if (key_tmp->len == 0) { + cache_key_delete(key_tmp); + return SUBTREE_WALK_RET_RESEARCH; + } + + return SUBTREE_WALK_RET_OK; +} + +/** + * fixup_overlap_contain - Handle case where new key completely contains an existing key. + * @key: Pointer to the new cache key being inserted. + * @key_tmp: Pointer to the existing key that is being contained. + * @ctx: Pointer to the context for walking the cache tree. + * + * This function deletes the existing key (key_tmp) when the new key + * completely contains it. It returns SUBTREE_WALK_RET_RESEARCH to indicate that the + * tree structure may have changed, necessitating a re-insertion of + * the new key. + */ +static int fixup_overlap_contain(struct pcache_cache_key *key, + struct pcache_cache_key *key_tmp, + struct pcache_cache_subtree_walk_ctx *ctx) +{ + /* + * |----| key_tmp + * |==========| key + */ + BUG_ON(cache_key_empty(key)); + cache_key_delete(key_tmp); + + return SUBTREE_WALK_RET_RESEARCH; +} + +/** + * fixup_overlap_contained - Handle overlap when a new key is contained in an existing key. + * @key: The new cache key being inserted. + * @key_tmp: The existing cache key that overlaps with the new key. + * @ctx: Context for the cache tree walk. + * + * This function adjusts the existing key if the new key is contained + * within it. If the existing key is empty, it indicates a placeholder key + * that was inserted during a miss read. This placeholder will later be + * updated with real data from the backing_dev, making it no longer an empty key. 
+ *
+ * If we delete a key or insert a key, the structure of the entire cache tree may change,
+ * requiring a fresh search of the tree to find a new insertion point.
+ */
+static int fixup_overlap_contained(struct pcache_cache_key *key,
+		struct pcache_cache_key *key_tmp, struct pcache_cache_subtree_walk_ctx *ctx)
+{
+	struct pcache_cache_tree *cache_tree = ctx->cache_tree;
+
+	/*
+	 * |-----------|		key_tmp
+	 *    |====|			key
+	 */
+	BUG_ON(cache_key_empty(key));
+	if (cache_key_empty(key_tmp)) {
+		/* If key_tmp is empty, don't split it;
+		 * it's a placeholder key for miss reads that will be updated later.
+		 */
+		cache_key_cutback(key_tmp, cache_key_lend(key_tmp) - cache_key_lstart(key));
+		if (key_tmp->len == 0) {
+			cache_key_delete(key_tmp);
+			return SUBTREE_WALK_RET_RESEARCH;
+		}
+	} else {
+		struct pcache_cache_key *key_fixup;
+		bool need_research = false;
+
+		key_fixup = get_pre_alloc_key(ctx);
+		if (!key_fixup)
+			return SUBTREE_WALK_RET_NEED_KEY;
+
+		cache_key_copy(key_fixup, key_tmp);
+
+		/* Split key_tmp based on the new key's range */
+		cache_key_cutback(key_tmp, cache_key_lend(key_tmp) - cache_key_lstart(key));
+		if (key_tmp->len == 0) {
+			cache_key_delete(key_tmp);
+			need_research = true;
+		}
+
+		/* Create a new portion for key_fixup */
+		cache_key_cutfront(key_fixup, cache_key_lend(key) - cache_key_lstart(key_tmp));
+		if (key_fixup->len == 0) {
+			cache_key_put(key_fixup);
+		} else {
+			/* Insert the new key into the cache */
+			cache_key_insert(cache_tree, key_fixup, false);
+			need_research = true;
+		}
+
+		if (need_research)
+			return SUBTREE_WALK_RET_RESEARCH;
+	}
+
+	return SUBTREE_WALK_RET_OK;
+}
+
+/**
+ * fixup_overlap_head - Handle overlap when a new key overlaps with the head of an existing key.
+ * @key: The new cache key being inserted.
+ * @key_tmp: The existing cache key that overlaps with the new key.
+ * @ctx: Context for the cache tree walk.
+ *
+ * This function adjusts the existing key if the new key overlaps
+ * with the beginning of it. If the resulting key length is zero
+ * after the adjustment, the key is deleted. This indicates that
+ * the key no longer holds valid data and requires the tree to be
+ * searched again for a new insertion point.
+ */
+static int fixup_overlap_head(struct pcache_cache_key *key,
+		struct pcache_cache_key *key_tmp, struct pcache_cache_subtree_walk_ctx *ctx)
+{
+	/*
+	 * |--------|			key_tmp
+	 *      |==========|		key
+	 */
+	BUG_ON(cache_key_empty(key));
+	/* Adjust key_tmp by cutting back based on the new key's start */
+	cache_key_cutback(key_tmp, cache_key_lend(key_tmp) - cache_key_lstart(key));
+	if (key_tmp->len == 0) {
+		/* If the adjusted key_tmp length is zero, delete it */
+		cache_key_delete(key_tmp);
+		return SUBTREE_WALK_RET_RESEARCH;
+	}
+
+	return SUBTREE_WALK_RET_OK;
+}
+
+/**
+ * cache_key_insert - Insert a new cache key into the cache tree.
+ * @cache_tree: Pointer to the cache_tree structure.
+ * @key: The cache key to insert.
+ * @fixup: Fix up overlaps with existing keys if true.
+ *
+ * This function searches for the appropriate location to insert
+ * a new cache key into the cache tree. It handles key overlaps
+ * and ensures any invalid keys are removed before insertion.
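+ *
+ * A minimal caller sketch (illustrative only; it mirrors the call
+ * sites in cache_write() and kset_replay() below, with error handling
+ * elided). The subtree lock must be held around the insertion:
+ *
+ *	key = cache_key_alloc(&cache->req_key_tree, GFP_NOIO);
+ *	key->off = off;
+ *	key->len = len;
+ *	cache_subtree = get_subtree(&cache->req_key_tree, key->off);
+ *	spin_lock(&cache_subtree->tree_lock);
+ *	cache_key_insert(&cache->req_key_tree, key, true);
+ *	spin_unlock(&cache_subtree->tree_lock);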
+ */ +void cache_key_insert(struct pcache_cache_tree *cache_tree, struct pcache_cache_key *key, bool fixup) +{ + struct pcache_cache *cache = cache_tree->cache; + struct pcache_cache_subtree_walk_ctx walk_ctx = { 0 }; + struct rb_node **new, *parent = NULL; + struct pcache_cache_subtree *cache_subtree; + struct pcache_cache_key *key_tmp = NULL, *key_next; + struct rb_node *prev_node = NULL; + LIST_HEAD(delete_key_list); + int ret; + + cache_subtree = get_subtree(cache_tree, key->off); + key->cache_subtree = cache_subtree; +search: + prev_node = cache_subtree_search(cache_subtree, key, &parent, &new, &delete_key_list); + if (!list_empty(&delete_key_list)) { + /* Remove invalid keys from the delete list */ + list_for_each_entry_safe(key_tmp, key_next, &delete_key_list, list_node) { + list_del_init(&key_tmp->list_node); + cache_key_delete(key_tmp); + } + goto search; + } + + if (fixup) { + /* Set up the context with the cache, start node, and new key */ + walk_ctx.cache_tree = cache_tree; + walk_ctx.start_node = prev_node; + walk_ctx.key = key; + + /* Assign overlap handling functions for different scenarios */ + walk_ctx.overlap_tail = fixup_overlap_tail; + walk_ctx.overlap_head = fixup_overlap_head; + walk_ctx.overlap_contain = fixup_overlap_contain; + walk_ctx.overlap_contained = fixup_overlap_contained; + + ret = cache_subtree_walk(&walk_ctx); + switch (ret) { + case SUBTREE_WALK_RET_OK: + break; + case SUBTREE_WALK_RET_RESEARCH: + goto search; + case SUBTREE_WALK_RET_NEED_KEY: + spin_unlock(&cache_subtree->tree_lock); + pcache_dev_debug(CACHE_TO_PCACHE(cache), "allocate pre_alloc_key with GFP_NOIO"); + walk_ctx.pre_alloc_key = cache_key_alloc(cache_tree, GFP_NOIO); + spin_lock(&cache_subtree->tree_lock); + goto search; + default: + BUG(); + } + } + + if (walk_ctx.pre_alloc_key) + cache_key_put(walk_ctx.pre_alloc_key); + + /* Link and insert the new key into the red-black tree */ + rb_link_node(&key->rb_node, parent, new); + rb_insert_color(&key->rb_node, &cache_subtree->root); +} + +/** + * clean_fn - Cleanup function to remove invalid keys from the cache tree. + * @work: Pointer to the work_struct associated with the cleanup. + * + * This function cleans up invalid keys from the cache tree in the background + * after a cache segment has been invalidated during cache garbage collection. + * It processes a maximum of PCACHE_CLEAN_KEYS_MAX keys per iteration and holds + * the tree lock to ensure thread safety. + */ +void clean_fn(struct work_struct *work) +{ + struct pcache_cache *cache = container_of(work, struct pcache_cache, clean_work); + struct pcache_cache_subtree *cache_subtree; + struct rb_node *node; + struct pcache_cache_key *key; + int i, count; + + for (i = 0; i < cache->req_key_tree.n_subtrees; i++) { + cache_subtree = &cache->req_key_tree.subtrees[i]; + +again: + if (pcache_is_stopping(CACHE_TO_PCACHE(cache))) + return; + + /* Delete up to PCACHE_CLEAN_KEYS_MAX keys in one iteration */ + count = 0; + spin_lock(&cache_subtree->tree_lock); + node = rb_first(&cache_subtree->root); + while (node) { + key = CACHE_KEY(node); + node = rb_next(node); + if (cache_key_invalid(key)) { + count++; + cache_key_delete(key); + } + + if (count >= PCACHE_CLEAN_KEYS_MAX) { + /* Unlock and pause before continuing cleanup */ + spin_unlock(&cache_subtree->tree_lock); + usleep_range(1000, 2000); + goto again; + } + } + spin_unlock(&cache_subtree->tree_lock); + } +} + +/* + * kset_flush_fn - Flush work for a cache kset. 
+ *
+ * This function runs when the delayed flush work queued from
+ * cache_key_append() fires. cache_key_append() closes a full kset
+ * immediately; for a kset that is not yet full it queues this work
+ * to close the kset a little later.
+ *
+ * If cache_kset_close detects that a new segment is required to store
+ * the kset and there are no available segments, it will return an error.
+ * In this scenario, a retry will be attempted.
+ */
+void kset_flush_fn(struct work_struct *work)
+{
+	struct pcache_cache_kset *kset = container_of(work, struct pcache_cache_kset, flush_work.work);
+	struct pcache_cache *cache = kset->cache;
+	int ret;
+
+	if (pcache_is_stopping(CACHE_TO_PCACHE(cache)))
+		return;
+
+	spin_lock(&kset->kset_lock);
+	ret = cache_kset_close(cache, kset);
+	spin_unlock(&kset->kset_lock);
+
+	if (ret) {
+		/* Failed to flush kset, schedule a retry. */
+		queue_delayed_work(cache_get_wq(cache), &kset->flush_work, msecs_to_jiffies(100));
+	}
+}
+
+static int kset_replay(struct pcache_cache *cache, struct pcache_cache_kset_onmedia *kset_onmedia)
+{
+	struct pcache_cache_key_onmedia *key_onmedia;
+	struct pcache_cache_subtree *cache_subtree;
+	struct pcache_cache_segment *cache_seg;
+	struct pcache_cache_key *key;
+	int ret;
+	int i;
+
+	for (i = 0; i < kset_onmedia->key_num; i++) {
+		key_onmedia = &kset_onmedia->data[i];
+
+		key = cache_key_alloc(&cache->req_key_tree, GFP_NOIO);
+		ret = cache_key_decode(cache, key_onmedia, key);
+		if (ret) {
+			cache_key_put(key);
+			goto err;
+		}
+
+		/*
+		 * Take the segment reference before the key can be dropped
+		 * below, so the segment is never touched through a freed key.
+		 */
+		cache_seg = key->cache_pos.cache_seg;
+		__set_bit(cache_seg->cache_seg_id, cache->seg_map);
+		cache_seg_get(cache_seg);
+
+		/* Check if the segment generation is valid for insertion. */
+		if (key->seg_gen < cache_seg->gen) {
+			cache_key_put(key);
+		} else {
+			cache_subtree = get_subtree(&cache->req_key_tree, key->off);
+			spin_lock(&cache_subtree->tree_lock);
+			cache_key_insert(&cache->req_key_tree, key, true);
+			spin_unlock(&cache_subtree->tree_lock);
+		}
+	}
+
+	return 0;
+err:
+	return ret;
+}
+
+int cache_replay(struct pcache_cache *cache)
+{
+	struct dm_pcache *pcache = CACHE_TO_PCACHE(cache);
+	struct pcache_cache_pos pos_tail;
+	struct pcache_cache_pos *pos;
+	struct pcache_cache_kset_onmedia *kset_onmedia;
+	u32 to_copy, count = 0;
+	int ret = 0;
+
+	kset_onmedia = kzalloc(PCACHE_KSET_ONMEDIA_SIZE_MAX, GFP_KERNEL);
+	if (!kset_onmedia)
+		return -ENOMEM;
+
+	cache_pos_copy(&pos_tail, &cache->key_tail);
+	pos = &pos_tail;
+
+	/*
+	 * During the replay stage nothing else accesses cache->seg_map,
+	 * so we can set bits here without taking cache->seg_map_lock.
+	 */
+	__set_bit(pos->cache_seg->cache_seg_id, cache->seg_map);
+
+	while (true) {
+		to_copy = min(PCACHE_KSET_ONMEDIA_SIZE_MAX, PCACHE_SEG_SIZE - pos->seg_off);
+		ret = copy_mc_to_kernel(kset_onmedia, cache_pos_addr(pos), to_copy);
+		if (ret) {
+			ret = -EIO;
+			goto out;
+		}
+
+		if (kset_onmedia->magic != PCACHE_KSET_MAGIC ||
+		    kset_onmedia->crc != cache_kset_crc(kset_onmedia)) {
+			break;
+		}
+
+		/* Process the last kset and prepare for the next segment. */
+		if (kset_onmedia->flags & PCACHE_KSET_FLAGS_LAST) {
+			struct pcache_cache_segment *next_seg;
+
+			pcache_dev_debug(pcache, "last kset replay, next: %u\n", kset_onmedia->next_cache_seg_id);
+
+			next_seg = &cache->segments[kset_onmedia->next_cache_seg_id];
+
+			pos->cache_seg = next_seg;
+			pos->seg_off = 0;
+
+			__set_bit(pos->cache_seg->cache_seg_id, cache->seg_map);
+			continue;
+		}
+
+		/* Replay the kset and check for errors. */
+		ret = kset_replay(cache, kset_onmedia);
+		if (ret)
+			goto out;
+
+		/* Advance the position after processing the kset.
*/ + cache_pos_advance(pos, get_kset_onmedia_size(kset_onmedia)); + if (++count > 512) { + cond_resched(); + count = 0; + } + } + + /* Update the key_head position after replaying. */ + spin_lock(&cache->key_head_lock); + cache_pos_copy(&cache->key_head, pos); + spin_unlock(&cache->key_head_lock); +out: + kfree(kset_onmedia); + return ret; +} + +int cache_tree_init(struct pcache_cache *cache, struct pcache_cache_tree *cache_tree, u32 n_subtrees) +{ + int ret; + u32 i; + + cache_tree->cache = cache; + cache_tree->n_subtrees = n_subtrees; + + ret = mempool_init_slab_pool(&cache_tree->key_pool, 1024, key_cache); + if (ret) + goto err; + + /* + * Allocate and initialize the subtrees array. + * Each element is a cache tree structure that contains + * an RB tree root and a spinlock for protecting its contents. + */ + cache_tree->subtrees = kvcalloc(cache_tree->n_subtrees, sizeof(struct pcache_cache_subtree), GFP_KERNEL); + if (!cache_tree->subtrees) { + ret = -ENOMEM; + goto key_pool_exit; + } + + for (i = 0; i < cache_tree->n_subtrees; i++) { + struct pcache_cache_subtree *cache_subtree = &cache_tree->subtrees[i]; + + cache_subtree->root = RB_ROOT; + spin_lock_init(&cache_subtree->tree_lock); + } + + return 0; + +key_pool_exit: + mempool_exit(&cache_tree->key_pool); +err: + return ret; +} + +void cache_tree_clear(struct pcache_cache_tree *cache_tree) +{ + struct pcache_cache_subtree *cache_subtree; + struct rb_node *node; + struct pcache_cache_key *key; + u32 i; + + for (i = 0; i < cache_tree->n_subtrees; i++) { + cache_subtree = &cache_tree->subtrees[i]; + + spin_lock(&cache_subtree->tree_lock); + node = rb_first(&cache_subtree->root); + while (node) { + key = CACHE_KEY(node); + node = rb_next(node); + + cache_key_delete(key); + } + spin_unlock(&cache_subtree->tree_lock); + } +} + +void cache_tree_exit(struct pcache_cache_tree *cache_tree) +{ + cache_tree_clear(cache_tree); + kvfree(cache_tree->subtrees); + mempool_exit(&cache_tree->key_pool); +} diff --git a/drivers/md/dm-pcache/cache_req.c b/drivers/md/dm-pcache/cache_req.c new file mode 100644 index 000000000000..27f94c1fa968 --- /dev/null +++ b/drivers/md/dm-pcache/cache_req.c @@ -0,0 +1,836 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include "cache.h" +#include "backing_dev.h" +#include "cache_dev.h" +#include "dm_pcache.h" + +static int cache_data_head_init(struct pcache_cache *cache) +{ + struct pcache_cache_segment *next_seg; + struct pcache_cache_data_head *data_head; + + data_head = get_data_head(cache); + next_seg = get_cache_segment(cache); + if (!next_seg) + return -EBUSY; + + cache_seg_get(next_seg); + data_head->head_pos.cache_seg = next_seg; + data_head->head_pos.seg_off = 0; + + return 0; +} + +/** + * cache_data_alloc - Allocate data for a cache key. + * @cache: Pointer to the cache structure. + * @key: Pointer to the cache key to allocate data for. + * + * This function tries to allocate space from the cache segment specified by the + * data head. If the remaining space in the segment is insufficient to allocate + * the requested length for the cache key, it will allocate whatever is available + * and adjust the key's length accordingly. This function does not allocate + * space that crosses segment boundaries. 
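+ *
+ * Worked example (hypothetical sizes): if the current data head
+ * segment has 4 KiB remaining and the key asks for 16 KiB, only
+ * 4 KiB is allocated and key->len is trimmed to 4 KiB; the data head
+ * is then reset so the next allocation starts on a fresh segment, and
+ * the caller (see the loop in cache_write()) allocates further keys
+ * for the remaining 12 KiB.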
+ */ +static int cache_data_alloc(struct pcache_cache *cache, struct pcache_cache_key *key) +{ + struct pcache_cache_data_head *data_head; + struct pcache_cache_pos *head_pos; + struct pcache_cache_segment *cache_seg; + u32 seg_remain; + u32 allocated = 0, to_alloc; + int ret = 0; + + preempt_disable(); + data_head = get_data_head(cache); +again: + to_alloc = key->len - allocated; + if (!data_head->head_pos.cache_seg) { + seg_remain = 0; + } else { + cache_pos_copy(&key->cache_pos, &data_head->head_pos); + key->seg_gen = key->cache_pos.cache_seg->gen; + + head_pos = &data_head->head_pos; + cache_seg = head_pos->cache_seg; + seg_remain = cache_seg_remain(head_pos); + } + + if (seg_remain > to_alloc) { + /* If remaining space in segment is sufficient for the cache key, allocate it. */ + cache_pos_advance(head_pos, to_alloc); + allocated += to_alloc; + cache_seg_get(cache_seg); + } else if (seg_remain) { + /* If remaining space is not enough, allocate the remaining space and adjust the cache key length. */ + cache_pos_advance(head_pos, seg_remain); + key->len = seg_remain; + + /* Get for key: obtain a reference to the cache segment for the key. */ + cache_seg_get(cache_seg); + /* Put for head_pos->cache_seg: release the reference for the current head's segment. */ + cache_seg_put(head_pos->cache_seg); + head_pos->cache_seg = NULL; + } else { + /* Initialize a new data head if no segment is available. */ + ret = cache_data_head_init(cache); + if (ret) + goto out; + + goto again; + } + +out: + preempt_enable(); + + return ret; +} + +static int cache_copy_from_req_bio(struct pcache_cache *cache, struct pcache_cache_key *key, + struct pcache_request *pcache_req, u32 bio_off) +{ + struct pcache_cache_pos *pos = &key->cache_pos; + struct pcache_segment *segment; + + segment = &pos->cache_seg->segment; + + return segment_copy_from_bio(segment, pos->seg_off, key->len, pcache_req->bio, bio_off); +} + +static int cache_copy_to_req_bio(struct pcache_cache *cache, struct pcache_request *pcache_req, + u32 bio_off, u32 len, struct pcache_cache_pos *pos, u64 key_gen) +{ + struct pcache_cache_segment *cache_seg = pos->cache_seg; + struct pcache_segment *segment = &cache_seg->segment; + int ret; + + spin_lock(&cache_seg->gen_lock); + if (key_gen < cache_seg->gen) { + spin_unlock(&cache_seg->gen_lock); + return -EINVAL; + } + + ret = segment_copy_to_bio(segment, pos->seg_off, len, pcache_req->bio, bio_off); + spin_unlock(&cache_seg->gen_lock); + + return ret; +} + +/** + * miss_read_end_req - Handle the end of a miss read request. + * @backing_req: Pointer to the request structure. + * @read_ret: Return value of read. + * + * This function is called when a backing request to read data from + * the backing_dev is completed. If the key associated with the request + * is empty (a placeholder), it allocates cache space for the key, + * copies the data read from the bio into the cache, and updates + * the key's status. If the key has been overwritten by a write + * request during this process, it will be deleted from the cache + * tree and no further action will be taken. 
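+ *
+ * Sketch of the successful fill path (an illustrative summary of the
+ * code below, all performed under the subtree lock):
+ *
+ *	cache_data_alloc(cache, key);			// reserve cache space
+ *	cache_copy_from_req_bio(cache, key, ...);	// copy the read data in
+ *	key->flags &= ~PCACHE_CACHE_KEY_FLAGS_EMPTY;
+ *	key->flags |= PCACHE_CACHE_KEY_FLAGS_CLEAN;
+ *	cache_key_append(cache, key, false);		// persist key metadata
+ *
+ * A failure at any of these steps deletes the placeholder key instead.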
+ */
+static void miss_read_end_req(struct pcache_backing_dev_req *backing_req, int read_ret)
+{
+	void *priv_data = backing_req->priv_data;
+	struct pcache_request *pcache_req = backing_req->req.upper_req;
+	struct pcache_cache *cache = backing_req->backing_dev->cache;
+	int ret;
+
+	if (priv_data) {
+		struct pcache_cache_key *key;
+		struct pcache_cache_subtree *cache_subtree;
+
+		key = (struct pcache_cache_key *)priv_data;
+		cache_subtree = key->cache_subtree;
+
+		/* If this key was deleted from the cache_subtree by a write, its
+		 * EMPTY flag has been cleared; so if cache_key_empty() returns
+		 * true, the key is still in the cache_subtree.
+		 */
+		spin_lock(&cache_subtree->tree_lock);
+		if (cache_key_empty(key)) {
+			/* Check if the backing request was successful. */
+			if (read_ret) {
+				cache_key_delete(key);
+				goto unlock;
+			}
+
+			/* Allocate cache space for the key and copy data from the backing_dev. */
+			ret = cache_data_alloc(cache, key);
+			if (ret) {
+				cache_key_delete(key);
+				goto unlock;
+			}
+
+			ret = cache_copy_from_req_bio(cache, key, pcache_req, backing_req->req.bio_off);
+			if (ret) {
+				cache_seg_put(key->cache_pos.cache_seg);
+				cache_key_delete(key);
+				goto unlock;
+			}
+			key->flags &= ~PCACHE_CACHE_KEY_FLAGS_EMPTY;
+			key->flags |= PCACHE_CACHE_KEY_FLAGS_CLEAN;
+
+			/* Append the key to the cache. */
+			ret = cache_key_append(cache, key, false);
+			if (ret) {
+				cache_seg_put(key->cache_pos.cache_seg);
+				cache_key_delete(key);
+				goto unlock;
+			}
+		}
+unlock:
+		spin_unlock(&cache_subtree->tree_lock);
+		cache_key_put(key);
+	}
+}
+
+/**
+ * submit_cache_miss_req - Submit a backing request when cache data is missing
+ * @cache: The cache context that manages cache operations
+ * @backing_req: The cache request containing information about the read request
+ *
+ * This function is used to handle cases where a cache read request cannot locate
+ * the required data in the cache. When such a miss occurs during `cache_subtree_walk`,
+ * it triggers a backing read request to fetch data from the backing storage.
+ *
+ * If `backing_req->priv_data` is set, it points to a `pcache_cache_key` representing
+ * a new placeholder key to be inserted into the cache; the function calls
+ * `cache_key_insert` to add that key to the request tree before the backing
+ * request is submitted.
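+ *
+ * Ordering note (as understood from the code): the placeholder key is
+ * inserted before the backing request is submitted, so a concurrent
+ * write that overlaps the range can find and overwrite the empty key;
+ * miss_read_end_req() later re-checks cache_key_empty() under the
+ * subtree lock before filling the key with data read from the
+ * backing_dev.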
+ */ +static void submit_cache_miss_req(struct pcache_cache *cache, struct pcache_backing_dev_req *backing_req) +{ + if (backing_req->priv_data) { + struct pcache_cache_key *key; + + /* Attempt to insert the key into the cache if priv_data is set */ + key = (struct pcache_cache_key *)backing_req->priv_data; + cache_key_insert(&cache->req_key_tree, key, true); + } + backing_dev_req_submit(backing_req, false); +} + +static void cache_miss_req_free(struct pcache_backing_dev_req *backing_req) +{ + struct pcache_cache_key *key; + + if (backing_req->priv_data) { + key = backing_req->priv_data; + backing_req->priv_data = NULL; + cache_key_put(key); /* for ->priv_data */ + cache_key_put(key); /* for init ref in alloc */ + } + + backing_dev_req_end(backing_req); +} + +static struct pcache_backing_dev_req *cache_miss_req_alloc(struct pcache_cache *cache, + struct pcache_request *parent, + gfp_t gfp_mask) +{ + struct pcache_backing_dev *backing_dev = cache->backing_dev; + struct pcache_backing_dev_req *backing_req; + struct pcache_cache_key *key = NULL; + struct pcache_backing_dev_req_opts req_opts = { 0 }; + + req_opts.type = BACKING_DEV_REQ_TYPE_REQ; + req_opts.gfp_mask = gfp_mask; + req_opts.req.upper_req = parent; + + backing_req = backing_dev_req_alloc(backing_dev, &req_opts); + if (!backing_req) + return NULL; + + key = cache_key_alloc(&cache->req_key_tree, gfp_mask); + if (!key) + goto free_backing_req; + + cache_key_get(key); + backing_req->priv_data = key; + + return backing_req; + +free_backing_req: + cache_miss_req_free(backing_req); + return NULL; +} + +static void cache_miss_req_init(struct pcache_cache *cache, + struct pcache_backing_dev_req *backing_req, + struct pcache_request *parent, + u32 off, u32 len, bool insert_key) +{ + struct pcache_cache_key *key; + struct pcache_backing_dev_req_opts req_opts = { 0 }; + + req_opts.type = BACKING_DEV_REQ_TYPE_REQ; + req_opts.req.upper_req = parent; + req_opts.req.req_off = off; + req_opts.req.len = len; + req_opts.end_fn = miss_read_end_req; + + backing_dev_req_init(backing_req, &req_opts); + + if (insert_key) { + key = backing_req->priv_data; + key->off = parent->off + off; + key->len = len; + key->flags |= PCACHE_CACHE_KEY_FLAGS_EMPTY; + } else { + key = backing_req->priv_data; + backing_req->priv_data = NULL; + cache_key_put(key); + cache_key_put(key); + } +} + +static struct pcache_backing_dev_req *get_pre_alloc_req(struct pcache_cache_subtree_walk_ctx *ctx) +{ + struct pcache_cache *cache = ctx->cache_tree->cache; + struct pcache_request *pcache_req = ctx->pcache_req; + struct pcache_backing_dev_req *backing_req; + + if (ctx->pre_alloc_req) { + backing_req = ctx->pre_alloc_req; + ctx->pre_alloc_req = NULL; + + return backing_req; + } + + return cache_miss_req_alloc(cache, pcache_req, GFP_NOWAIT); +} + +/* + * In the process of walking the cache tree to locate cached data, this + * function handles the situation where the requested data range lies + * entirely before an existing cache node (`key_tmp`). This outcome + * signifies that the target data is absent from the cache (cache miss). + * + * To fulfill this portion of the read request, the function creates a + * backing request (`backing_req`) for the missing data range represented + * by `key`. It then appends this request to the submission list in the + * `ctx`, which will later be processed to retrieve the data from backing + * storage. 
After setting up the backing request, `req_done` in `ctx` is + * updated to reflect the length of the handled range, and the range + * in `key` is adjusted by trimming off the portion that is now handled. + * + * The scenario handled here: + * + * |--------| key_tmp (existing cached range) + * |====| key (requested range, preceding key_tmp) + * + * Since `key` is before `key_tmp`, it signifies that the requested data + * range is missing in the cache (cache miss) and needs retrieval from + * backing storage. + */ +static int read_before(struct pcache_cache_key *key, struct pcache_cache_key *key_tmp, + struct pcache_cache_subtree_walk_ctx *ctx) +{ + struct pcache_backing_dev_req *backing_req; + struct pcache_cache *cache = ctx->cache_tree->cache; + + /* + * In this scenario, `key` represents a range that precedes `key_tmp`, + * meaning the requested data range is missing from the cache tree + * and must be retrieved from the backing_dev. + */ + backing_req = get_pre_alloc_req(ctx); + if (!backing_req) + return SUBTREE_WALK_RET_NEED_REQ; + + cache_miss_req_init(cache, backing_req, ctx->pcache_req, ctx->req_done, key->len, true); + + list_add(&backing_req->node, ctx->submit_req_list); + ctx->req_done += key->len; + cache_key_cutfront(key, key->len); + + return SUBTREE_WALK_RET_OK; +} + +/* + * During cache_subtree_walk, this function manages a scenario where part of the + * requested data range overlaps with an existing cache node (`key_tmp`). + * + * |----------------| key_tmp (existing cached range) + * |===========| key (requested range, overlapping the tail of key_tmp) + */ +static int read_overlap_tail(struct pcache_cache_key *key, struct pcache_cache_key *key_tmp, + struct pcache_cache_subtree_walk_ctx *ctx) +{ + struct pcache_cache *cache = ctx->cache_tree->cache; + struct pcache_backing_dev_req *backing_req; + u32 io_len; + int ret; + + /* + * Calculate the length of the non-overlapping portion of `key` + * before `key_tmp`, representing the data missing in the cache. + */ + io_len = cache_key_lstart(key_tmp) - cache_key_lstart(key); + if (io_len) { + backing_req = get_pre_alloc_req(ctx); + if (!backing_req) + return SUBTREE_WALK_RET_NEED_REQ; + + cache_miss_req_init(cache, backing_req, ctx->pcache_req, ctx->req_done, io_len, true); + + list_add(&backing_req->node, ctx->submit_req_list); + ctx->req_done += io_len; + cache_key_cutfront(key, io_len); + } + + /* + * Handle the overlapping portion by calculating the length of + * the remaining data in `key` that coincides with `key_tmp`. 
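+ *
+ * Hypothetical numbers: if key requests [0, 12K) and key_tmp caches
+ * [8K, 20K), the first io_len is 8K - 0 = 8K of cache miss handled by
+ * a backing request, and the second io_len is 12K - 8K = 4K of overlap
+ * served from the cache (or by another miss request when key_tmp is an
+ * empty placeholder).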
+ */ + io_len = cache_key_lend(key) - cache_key_lstart(key_tmp); + if (cache_key_empty(key_tmp)) { + backing_req = get_pre_alloc_req(ctx); + if (!backing_req) + return SUBTREE_WALK_RET_NEED_REQ; + + cache_miss_req_init(cache, backing_req, ctx->pcache_req, ctx->req_done, io_len, false); + submit_cache_miss_req(cache, backing_req); + } else { + ret = cache_copy_to_req_bio(ctx->cache_tree->cache, ctx->pcache_req, ctx->req_done, + io_len, &key_tmp->cache_pos, key_tmp->seg_gen); + if (ret) { + if (ret == -EINVAL) { + cache_key_delete(key_tmp); + return SUBTREE_WALK_RET_RESEARCH; + } + + ctx->ret = ret; + return SUBTREE_WALK_RET_ERR; + } + } + + ctx->req_done += io_len; + cache_key_cutfront(key, io_len); + + return SUBTREE_WALK_RET_OK; +} + +/* + * |----| key_tmp (existing cached range) + * |==========| key (requested range) + */ +static int read_overlap_contain(struct pcache_cache_key *key, struct pcache_cache_key *key_tmp, + struct pcache_cache_subtree_walk_ctx *ctx) +{ + struct pcache_cache *cache = ctx->cache_tree->cache; + struct pcache_backing_dev_req *backing_req; + u32 io_len; + int ret; + + /* + * Calculate the non-overlapping part of `key` before `key_tmp` + * to identify the missing data length. + */ + io_len = cache_key_lstart(key_tmp) - cache_key_lstart(key); + if (io_len) { + backing_req = get_pre_alloc_req(ctx); + if (!backing_req) + return SUBTREE_WALK_RET_NEED_REQ; + + cache_miss_req_init(cache, backing_req, ctx->pcache_req, ctx->req_done, io_len, true); + + list_add(&backing_req->node, ctx->submit_req_list); + + ctx->req_done += io_len; + cache_key_cutfront(key, io_len); + } + + /* + * Handle the overlapping portion between `key` and `key_tmp`. + */ + io_len = key_tmp->len; + if (cache_key_empty(key_tmp)) { + backing_req = get_pre_alloc_req(ctx); + if (!backing_req) + return SUBTREE_WALK_RET_NEED_REQ; + + cache_miss_req_init(cache, backing_req, ctx->pcache_req, ctx->req_done, io_len, false); + submit_cache_miss_req(cache, backing_req); + } else { + ret = cache_copy_to_req_bio(ctx->cache_tree->cache, ctx->pcache_req, ctx->req_done, + io_len, &key_tmp->cache_pos, key_tmp->seg_gen); + if (ret) { + if (ret == -EINVAL) { + cache_key_delete(key_tmp); + return SUBTREE_WALK_RET_RESEARCH; + } + + ctx->ret = ret; + return SUBTREE_WALK_RET_ERR; + } + } + + ctx->req_done += io_len; + cache_key_cutfront(key, io_len); + + return SUBTREE_WALK_RET_OK; +} + +/* + * |-----------| key_tmp (existing cached range) + * |====| key (requested range, fully within key_tmp) + * + * If `key_tmp` contains valid cached data, this function copies the relevant + * portion to the request's bio. Otherwise, it sends a backing request to + * fetch the required data range. + */ +static int read_overlap_contained(struct pcache_cache_key *key, struct pcache_cache_key *key_tmp, + struct pcache_cache_subtree_walk_ctx *ctx) +{ + struct pcache_cache *cache = ctx->cache_tree->cache; + struct pcache_backing_dev_req *backing_req; + struct pcache_cache_pos pos; + int ret; + + /* + * Check if `key_tmp` is empty, indicating a miss. If so, initiate + * a backing request to fetch the required data for `key`. 
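+ *
+ * Hypothetical numbers: if key_tmp caches [0, 16K) and key requests
+ * [4K, 12K), the copy source position is key_tmp->cache_pos advanced
+ * by 4K - 0 = 4K, and key->len == 8K bytes are copied straight into
+ * the request bio.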
+ */ + if (cache_key_empty(key_tmp)) { + backing_req = get_pre_alloc_req(ctx); + if (!backing_req) + return SUBTREE_WALK_RET_NEED_REQ; + + cache_miss_req_init(cache, backing_req, ctx->pcache_req, ctx->req_done, key->len, false); + submit_cache_miss_req(cache, backing_req); + } else { + cache_pos_copy(&pos, &key_tmp->cache_pos); + cache_pos_advance(&pos, cache_key_lstart(key) - cache_key_lstart(key_tmp)); + + ret = cache_copy_to_req_bio(ctx->cache_tree->cache, ctx->pcache_req, ctx->req_done, + key->len, &pos, key_tmp->seg_gen); + if (ret) { + if (ret == -EINVAL) { + cache_key_delete(key_tmp); + return SUBTREE_WALK_RET_RESEARCH; + } + + ctx->ret = ret; + return SUBTREE_WALK_RET_ERR; + } + } + + ctx->req_done += key->len; + cache_key_cutfront(key, key->len); + + return SUBTREE_WALK_RET_OK; +} + +/* + * |--------| key_tmp (existing cached range) + * |==========| key (requested range, overlapping the head of key_tmp) + */ +static int read_overlap_head(struct pcache_cache_key *key, struct pcache_cache_key *key_tmp, + struct pcache_cache_subtree_walk_ctx *ctx) +{ + struct pcache_cache *cache = ctx->cache_tree->cache; + struct pcache_backing_dev_req *backing_req; + struct pcache_cache_pos pos; + u32 io_len; + int ret; + + io_len = cache_key_lend(key_tmp) - cache_key_lstart(key); + + if (cache_key_empty(key_tmp)) { + backing_req = get_pre_alloc_req(ctx); + if (!backing_req) + return SUBTREE_WALK_RET_NEED_REQ; + + cache_miss_req_init(cache, backing_req, ctx->pcache_req, ctx->req_done, io_len, false); + submit_cache_miss_req(cache, backing_req); + } else { + cache_pos_copy(&pos, &key_tmp->cache_pos); + cache_pos_advance(&pos, cache_key_lstart(key) - cache_key_lstart(key_tmp)); + + ret = cache_copy_to_req_bio(ctx->cache_tree->cache, ctx->pcache_req, ctx->req_done, + io_len, &pos, key_tmp->seg_gen); + if (ret) { + if (ret == -EINVAL) { + cache_key_delete(key_tmp); + return SUBTREE_WALK_RET_RESEARCH; + } + + ctx->ret = ret; + return SUBTREE_WALK_RET_ERR; + } + } + + ctx->req_done += io_len; + cache_key_cutfront(key, io_len); + + return SUBTREE_WALK_RET_OK; +} + +/** + * read_walk_finally - Finalizes the cache read tree walk by submitting any + * remaining backing requests + * @ctx: Context structure holding information about the cache, + * read request, and submission list + * @ret: the return value after this walk. + * + * This function is called at the end of the `cache_subtree_walk` during a + * cache read operation. It completes the walk by checking if any data + * requested by `key` was not found in the cache tree, and if so, it sends + * a backing request to retrieve that data. Then, it iterates through the + * submission list of backing requests created during the walk, removing + * each request from the list and submitting it. + * + * The scenario managed here includes: + * - Sending a backing request for the remaining length of `key` if it was + * not fulfilled by existing cache entries. + * - Iterating through `ctx->submit_req_list` to submit each backing request + * enqueued during the walk. + * + * This ensures all necessary backing requests for cache misses are submitted + * to the backing storage to retrieve any data that could not be found in + * the cache. 
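+ *
+ * Note the order in the body below: the requests queued on
+ * ctx->submit_req_list are submitted first; only then, and only if the
+ * walk finished with SUBTREE_WALK_RET_OK, is one more backing request
+ * created for any still-unhandled tail of `key`.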
+ */ +static int read_walk_finally(struct pcache_cache_subtree_walk_ctx *ctx, int ret) +{ + struct pcache_cache *cache = ctx->cache_tree->cache; + struct pcache_backing_dev_req *backing_req, *next_req; + struct pcache_cache_key *key = ctx->key; + + list_for_each_entry_safe(backing_req, next_req, ctx->submit_req_list, node) { + list_del_init(&backing_req->node); + submit_cache_miss_req(ctx->cache_tree->cache, backing_req); + } + + if (ret != SUBTREE_WALK_RET_OK) + return ret; + + if (key->len) { + backing_req = get_pre_alloc_req(ctx); + if (!backing_req) + return SUBTREE_WALK_RET_NEED_REQ; + + cache_miss_req_init(cache, backing_req, ctx->pcache_req, ctx->req_done, key->len, true); + submit_cache_miss_req(cache, backing_req); + ctx->req_done += key->len; + } + + return SUBTREE_WALK_RET_OK; +} + +/* + * This function is used within `cache_subtree_walk` to determine whether the + * read operation has covered the requested data length. It compares the + * amount of data processed (`ctx->req_done`) with the total data length + * specified in the original request (`ctx->pcache_req->data_len`). + * + * If `req_done` meets or exceeds the required data length, the function + * returns `true`, indicating the walk is complete. Otherwise, it returns `false`, + * signaling that additional data processing is needed to fulfill the request. + */ +static bool read_walk_done(struct pcache_cache_subtree_walk_ctx *ctx) +{ + return (ctx->req_done >= ctx->pcache_req->data_len); +} + +/** + * cache_read - Process a read request by traversing the cache tree + * @cache: Cache structure holding cache trees and related configurations + * @pcache_req: Request structure with information about the data to read + * + * This function attempts to fulfill a read request by traversing the cache tree(s) + * to locate cached data for the requested range. If parts of the data are missing + * in the cache, backing requests are generated to retrieve the required segments. + * + * The function operates by initializing a key for the requested data range and + * preparing a context (`walk_ctx`) to manage the cache tree traversal. The context + * includes pointers to functions (e.g., `read_before`, `read_overlap_tail`) that handle + * specific conditions encountered during the traversal. The `walk_finally` and `walk_done` + * functions manage the end stages of the traversal, while the `delete_key_list` and + * `submit_req_list` lists track any keys to be deleted or requests to be submitted. + * + * The function first calculates the requested range and checks if it fits within the + * current cache tree (based on the tree's size limits). It then locks the cache tree + * and performs a search to locate any matching keys. If there are outdated keys, + * these are deleted, and the search is restarted to ensure accurate data retrieval. + * + * If the requested range spans multiple cache trees, the function moves on to the + * next tree once the current range has been processed. This continues until the + * entire requested data length has been handled. 
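+ *
+ * Illustrative split (hypothetical sizes, assuming a 4 MiB subtree):
+ * an 8 KiB read starting 4 KiB before a subtree boundary is first
+ * clamped to the 4 KiB remaining in that subtree; once req_done
+ * reaches 4 KiB, the loop continues with the remaining 4 KiB in the
+ * next subtree.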
+ */ +static int cache_read(struct pcache_cache *cache, struct pcache_request *pcache_req) +{ + struct pcache_cache_key key_data = { .off = pcache_req->off, .len = pcache_req->data_len }; + struct pcache_cache_subtree *cache_subtree; + struct pcache_cache_key *key_tmp = NULL, *key_next; + struct rb_node *prev_node = NULL; + struct pcache_cache_key *key = &key_data; + struct pcache_cache_subtree_walk_ctx walk_ctx = { 0 }; + struct pcache_backing_dev_req *backing_req, *next_req; + LIST_HEAD(delete_key_list); + LIST_HEAD(submit_req_list); + int ret; + + walk_ctx.cache_tree = &cache->req_key_tree; + walk_ctx.req_done = 0; + walk_ctx.pcache_req = pcache_req; + walk_ctx.before = read_before; + walk_ctx.overlap_tail = read_overlap_tail; + walk_ctx.overlap_head = read_overlap_head; + walk_ctx.overlap_contain = read_overlap_contain; + walk_ctx.overlap_contained = read_overlap_contained; + walk_ctx.walk_finally = read_walk_finally; + walk_ctx.walk_done = read_walk_done; + walk_ctx.delete_key_list = &delete_key_list; + walk_ctx.submit_req_list = &submit_req_list; + +next: + key->off = pcache_req->off + walk_ctx.req_done; + key->len = pcache_req->data_len - walk_ctx.req_done; + if (key->len > PCACHE_CACHE_SUBTREE_SIZE - (key->off & PCACHE_CACHE_SUBTREE_SIZE_MASK)) + key->len = PCACHE_CACHE_SUBTREE_SIZE - (key->off & PCACHE_CACHE_SUBTREE_SIZE_MASK); + + cache_subtree = get_subtree(&cache->req_key_tree, key->off); + spin_lock(&cache_subtree->tree_lock); +search: + prev_node = cache_subtree_search(cache_subtree, key, NULL, NULL, &delete_key_list); + if (!list_empty(&delete_key_list)) { + list_for_each_entry_safe(key_tmp, key_next, &delete_key_list, list_node) { + list_del_init(&key_tmp->list_node); + cache_key_delete(key_tmp); + } + goto search; + } + + walk_ctx.start_node = prev_node; + walk_ctx.key = key; + + ret = cache_subtree_walk(&walk_ctx); + if (ret == SUBTREE_WALK_RET_RESEARCH) + goto search; + spin_unlock(&cache_subtree->tree_lock); + + if (ret == SUBTREE_WALK_RET_ERR) { + ret = walk_ctx.ret; + goto out; + } + + if (ret == SUBTREE_WALK_RET_NEED_REQ) { + walk_ctx.pre_alloc_req = cache_miss_req_alloc(cache, pcache_req, GFP_NOIO); + pcache_dev_debug(CACHE_TO_PCACHE(cache), "allocate pre_alloc_req with GFP_NOIO"); + } + + if (walk_ctx.req_done < pcache_req->data_len) + goto next; + ret = 0; +out: + if (walk_ctx.pre_alloc_req) + cache_miss_req_free(walk_ctx.pre_alloc_req); + + list_for_each_entry_safe(backing_req, next_req, &submit_req_list, node) { + list_del_init(&backing_req->node); + backing_dev_req_end(backing_req); + } + + return ret; +} + +static int cache_write(struct pcache_cache *cache, struct pcache_request *pcache_req) +{ + struct pcache_cache_subtree *cache_subtree; + struct pcache_cache_key *key; + u64 offset = pcache_req->off; + u32 length = pcache_req->data_len; + u32 io_done = 0; + int ret; + + while (true) { + if (io_done >= length) + break; + + key = cache_key_alloc(&cache->req_key_tree, GFP_NOIO); + key->off = offset + io_done; + key->len = length - io_done; + if (key->len > PCACHE_CACHE_SUBTREE_SIZE - (key->off & PCACHE_CACHE_SUBTREE_SIZE_MASK)) + key->len = PCACHE_CACHE_SUBTREE_SIZE - (key->off & PCACHE_CACHE_SUBTREE_SIZE_MASK); + + ret = cache_data_alloc(cache, key); + if (ret) { + cache_key_put(key); + goto err; + } + + ret = cache_copy_from_req_bio(cache, key, pcache_req, io_done); + if (ret) { + cache_seg_put(key->cache_pos.cache_seg); + cache_key_put(key); + goto err; + } + + cache_subtree = get_subtree(&cache->req_key_tree, key->off); + 
spin_lock(&cache_subtree->tree_lock); + cache_key_insert(&cache->req_key_tree, key, true); + ret = cache_key_append(cache, key, pcache_req->bio->bi_opf & REQ_FUA); + if (ret) { + cache_seg_put(key->cache_pos.cache_seg); + cache_key_delete(key); + goto unlock; + } + + io_done += key->len; + spin_unlock(&cache_subtree->tree_lock); + } + + return 0; +unlock: + spin_unlock(&cache_subtree->tree_lock); +err: + return ret; +} + +/** + * cache_flush - Flush all ksets to persist any pending cache data + * @cache: Pointer to the cache structure + * + * This function iterates through all ksets associated with the provided `cache` + * and ensures that any data marked for persistence is written to media. For each + * kset, it acquires the kset lock, then invokes `cache_kset_close`, which handles + * the persistence logic for that kset. + * + * If `cache_kset_close` encounters an error, the function exits immediately with + * the respective error code, preventing the flush operation from proceeding to + * subsequent ksets. + */ +int cache_flush(struct pcache_cache *cache) +{ + struct pcache_cache_kset *kset; + int ret; + u32 i; + + for (i = 0; i < cache->n_ksets; i++) { + kset = get_kset(cache, i); + + spin_lock(&kset->kset_lock); + ret = cache_kset_close(cache, kset); + spin_unlock(&kset->kset_lock); + + if (ret) + return ret; + } + + return 0; +} + +int pcache_cache_handle_req(struct pcache_cache *cache, struct pcache_request *pcache_req) +{ + struct bio *bio = pcache_req->bio; + + if (unlikely(bio->bi_opf & REQ_PREFLUSH)) + return cache_flush(cache); + + if (bio_data_dir(bio) == READ) + return cache_read(cache, pcache_req); + + return cache_write(cache, pcache_req); +} diff --git a/drivers/md/dm-pcache/cache_segment.c b/drivers/md/dm-pcache/cache_segment.c new file mode 100644 index 000000000000..f0b58980806e --- /dev/null +++ b/drivers/md/dm-pcache/cache_segment.c @@ -0,0 +1,305 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include "cache_dev.h" +#include "cache.h" +#include "backing_dev.h" +#include "dm_pcache.h" + +static inline struct pcache_segment_info *get_seg_info_addr(struct pcache_cache_segment *cache_seg) +{ + struct pcache_segment_info *seg_info_addr; + u32 seg_id = cache_seg->segment.seg_id; + void *seg_addr; + + seg_addr = CACHE_DEV_SEGMENT(cache_seg->cache->cache_dev, seg_id); + seg_info_addr = seg_addr + PCACHE_SEG_INFO_SIZE * cache_seg->info_index; + + return seg_info_addr; +} + +static void cache_seg_info_write(struct pcache_cache_segment *cache_seg) +{ + struct pcache_segment_info *seg_info_addr; + struct pcache_segment_info *seg_info = &cache_seg->cache_seg_info; + + mutex_lock(&cache_seg->info_lock); + seg_info->header.seq++; + seg_info->header.crc = pcache_meta_crc(&seg_info->header, sizeof(struct pcache_segment_info)); + + seg_info_addr = get_seg_info_addr(cache_seg); + memcpy_flushcache(seg_info_addr, seg_info, sizeof(struct pcache_segment_info)); + pmem_wmb(); + + cache_seg->info_index = (cache_seg->info_index + 1) % PCACHE_META_INDEX_MAX; + mutex_unlock(&cache_seg->info_lock); +} + +static int cache_seg_info_load(struct pcache_cache_segment *cache_seg) +{ + struct pcache_segment_info *cache_seg_info_addr_base, *cache_seg_info_addr; + struct pcache_cache_dev *cache_dev = cache_seg->cache->cache_dev; + struct dm_pcache *pcache = CACHE_DEV_TO_PCACHE(cache_dev); + u32 seg_id = cache_seg->segment.seg_id; + int ret = 0; + + cache_seg_info_addr_base = CACHE_DEV_SEGMENT(cache_dev, seg_id); + + mutex_lock(&cache_seg->info_lock); + cache_seg_info_addr = 
pcache_meta_find_latest(&cache_seg_info_addr_base->header, + sizeof(struct pcache_segment_info), + PCACHE_SEG_INFO_SIZE, + &cache_seg->cache_seg_info); + if (IS_ERR(cache_seg_info_addr)) { + ret = PTR_ERR(cache_seg_info_addr); + goto out; + } else if (!cache_seg_info_addr) { + ret = -EIO; + goto out; + } + cache_seg->info_index = cache_seg_info_addr - cache_seg_info_addr_base; +out: + mutex_unlock(&cache_seg->info_lock); + + if (ret) + pcache_dev_err(pcache, "can't read segment info of segment: %u, ret: %d\n", + cache_seg->segment.seg_id, ret); + return ret; +} + +static int cache_seg_ctrl_load(struct pcache_cache_segment *cache_seg) +{ + struct pcache_cache_seg_ctrl *cache_seg_ctrl = cache_seg->cache_seg_ctrl; + struct pcache_cache_seg_gen cache_seg_gen, *cache_seg_gen_addr; + int ret = 0; + + cache_seg_gen_addr = pcache_meta_find_latest(&cache_seg_ctrl->gen->header, + sizeof(struct pcache_cache_seg_gen), + sizeof(struct pcache_cache_seg_gen), + &cache_seg_gen); + if (IS_ERR(cache_seg_gen_addr)) { + ret = PTR_ERR(cache_seg_gen_addr); + goto out; + } + + if (!cache_seg_gen_addr) { + cache_seg->gen = 0; + cache_seg->gen_seq = 0; + cache_seg->gen_index = 0; + goto out; + } + + cache_seg->gen = cache_seg_gen.gen; + cache_seg->gen_seq = cache_seg_gen.header.seq; + cache_seg->gen_index = (cache_seg_gen_addr - cache_seg_ctrl->gen); +out: + + return ret; +} + +static inline struct pcache_cache_seg_gen *get_cache_seg_gen_addr(struct pcache_cache_segment *cache_seg) +{ + struct pcache_cache_seg_ctrl *cache_seg_ctrl = cache_seg->cache_seg_ctrl; + + return (cache_seg_ctrl->gen + cache_seg->gen_index); +} + +/* + * cache_seg_ctrl_write - write cache segment control information + * @seg: the cache segment to update + * + * This function writes the control information of a cache segment to media. + * + * Although this updates shared control data, we intentionally do not use + * any locking here. All accesses to control information are single-threaded: + * + * - All reads occur during the init phase, where no concurrent writes + * can happen. + * - Writes happen once during init and once when the last reference + * to the segment is dropped in cache_seg_put(). + * + * Both cases are guaranteed to be single-threaded, so there is no risk + * of concurrent read/write races. + */ +static void cache_seg_ctrl_write(struct pcache_cache_segment *cache_seg) +{ + struct pcache_cache_seg_gen cache_seg_gen; + + cache_seg_gen.gen = cache_seg->gen; + cache_seg_gen.header.seq = ++cache_seg->gen_seq; + cache_seg_gen.header.crc = pcache_meta_crc(&cache_seg_gen.header, + sizeof(struct pcache_cache_seg_gen)); + + memcpy_flushcache(get_cache_seg_gen_addr(cache_seg), &cache_seg_gen, sizeof(struct pcache_cache_seg_gen)); + pmem_wmb(); + + cache_seg->gen_index = (cache_seg->gen_index + 1) % PCACHE_META_INDEX_MAX; +} + +static void cache_seg_ctrl_init(struct pcache_cache_segment *cache_seg) +{ + cache_seg->gen = 0; + cache_seg->gen_seq = 0; + cache_seg->gen_index = 0; + cache_seg_ctrl_write(cache_seg); +} + +static int cache_seg_meta_load(struct pcache_cache_segment *cache_seg) +{ + int ret; + + ret = cache_seg_info_load(cache_seg); + if (ret) + goto err; + + ret = cache_seg_ctrl_load(cache_seg); + if (ret) + goto err; + + return 0; +err: + return ret; +} + +/** + * cache_seg_set_next_seg - Sets the ID of the next segment + * @cache_seg: Pointer to the cache segment structure. + * @seg_id: The segment ID to set as the next segment. 
+ * + * A pcache_cache allocates multiple cache segments, which are linked together + * through next_seg. When loading a pcache_cache, the first cache segment can + * be found using cache->seg_id, which allows access to all the cache segments. + */ +void cache_seg_set_next_seg(struct pcache_cache_segment *cache_seg, u32 seg_id) +{ + cache_seg->cache_seg_info.flags |= PCACHE_SEG_INFO_FLAGS_HAS_NEXT; + cache_seg->cache_seg_info.next_seg = seg_id; + cache_seg_info_write(cache_seg); +} + +int cache_seg_init(struct pcache_cache *cache, u32 seg_id, u32 cache_seg_id, + bool new_cache) +{ + struct pcache_cache_dev *cache_dev = cache->cache_dev; + struct pcache_cache_segment *cache_seg = &cache->segments[cache_seg_id]; + struct pcache_segment_init_options seg_options = { 0 }; + struct pcache_segment *segment = &cache_seg->segment; + int ret; + + cache_seg->cache = cache; + cache_seg->cache_seg_id = cache_seg_id; + spin_lock_init(&cache_seg->gen_lock); + atomic_set(&cache_seg->refs, 0); + mutex_init(&cache_seg->info_lock); + + /* init pcache_segment */ + seg_options.type = PCACHE_SEGMENT_TYPE_CACHE_DATA; + seg_options.data_off = PCACHE_CACHE_SEG_CTRL_OFF + PCACHE_CACHE_SEG_CTRL_SIZE; + seg_options.seg_id = seg_id; + seg_options.seg_info = &cache_seg->cache_seg_info; + pcache_segment_init(cache_dev, segment, &seg_options); + + cache_seg->cache_seg_ctrl = CACHE_DEV_SEGMENT(cache_dev, seg_id) + PCACHE_CACHE_SEG_CTRL_OFF; + + if (new_cache) { + cache_dev_zero_range(cache_dev, CACHE_DEV_SEGMENT(cache_dev, seg_id), + PCACHE_SEG_INFO_SIZE * PCACHE_META_INDEX_MAX + + PCACHE_CACHE_SEG_CTRL_SIZE); + + cache_seg_ctrl_init(cache_seg); + + cache_seg->info_index = 0; + cache_seg_info_write(cache_seg); + + /* clear outdated kset in segment */ + memcpy_flushcache(segment->data, &pcache_empty_kset, sizeof(struct pcache_cache_kset_onmedia)); + pmem_wmb(); + } else { + ret = cache_seg_meta_load(cache_seg); + if (ret) + goto err; + } + + return 0; +err: + return ret; +} + +/** + * get_cache_segment - Retrieves a free cache segment from the cache. + * @cache: Pointer to the cache structure. + * + * This function attempts to find a free cache segment that can be used. + * It locks the segment map and checks for the next available segment ID. + * If a free segment is found, it initializes it and returns a pointer to the + * cache segment structure. Returns NULL if no segments are available. 
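+ *
+ * Illustrative behaviour (hypothetical state): with n_segs == 8 and a
+ * hint of last_cache_seg == 6, a fully used tail (bits 6 and 7 set)
+ * makes find_next_zero_bit() return 8 == n_segs; the hint is then
+ * reset to 0 and the scan retried from the start, and only if that
+ * pass also finds no zero bit is the cache marked cache_full.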
+ */ +struct pcache_cache_segment *get_cache_segment(struct pcache_cache *cache) +{ + struct pcache_cache_segment *cache_seg; + u32 seg_id; + + spin_lock(&cache->seg_map_lock); +again: + seg_id = find_next_zero_bit(cache->seg_map, cache->n_segs, cache->last_cache_seg); + if (seg_id == cache->n_segs) { + /* reset the hint of ->last_cache_seg and retry */ + if (cache->last_cache_seg) { + cache->last_cache_seg = 0; + goto again; + } + cache->cache_full = true; + spin_unlock(&cache->seg_map_lock); + return NULL; + } + + /* + * found an available cache_seg, mark it used in seg_map + * and update the search hint ->last_cache_seg + */ + __set_bit(seg_id, cache->seg_map); + cache->last_cache_seg = seg_id; + spin_unlock(&cache->seg_map_lock); + + cache_seg = &cache->segments[seg_id]; + cache_seg->cache_seg_id = seg_id; + + return cache_seg; +} + +static void cache_seg_gen_increase(struct pcache_cache_segment *cache_seg) +{ + spin_lock(&cache_seg->gen_lock); + cache_seg->gen++; + spin_unlock(&cache_seg->gen_lock); + + cache_seg_ctrl_write(cache_seg); +} + +void cache_seg_get(struct pcache_cache_segment *cache_seg) +{ + atomic_inc(&cache_seg->refs); +} + +static void cache_seg_invalidate(struct pcache_cache_segment *cache_seg) +{ + struct pcache_cache *cache; + + cache = cache_seg->cache; + cache_seg_gen_increase(cache_seg); + + spin_lock(&cache->seg_map_lock); + if (cache->cache_full) + cache->cache_full = false; + __clear_bit(cache_seg->cache_seg_id, cache->seg_map); + spin_unlock(&cache->seg_map_lock); + + pcache_defer_reqs_kick(CACHE_TO_PCACHE(cache)); + /* clean_work will clean the bad key in key_tree*/ + queue_work(cache_get_wq(cache), &cache->clean_work); +} + +void cache_seg_put(struct pcache_cache_segment *cache_seg) +{ + if (atomic_dec_and_test(&cache_seg->refs)) + cache_seg_invalidate(cache_seg); +} diff --git a/drivers/md/dm-pcache/cache_writeback.c b/drivers/md/dm-pcache/cache_writeback.c new file mode 100644 index 000000000000..87a82b3fe836 --- /dev/null +++ b/drivers/md/dm-pcache/cache_writeback.c @@ -0,0 +1,261 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <linux/bio.h> + +#include "cache.h" +#include "backing_dev.h" +#include "cache_dev.h" +#include "dm_pcache.h" + +static void writeback_ctx_end(struct pcache_cache *cache, int ret) +{ + if (ret && !cache->writeback_ctx.ret) { + pcache_dev_err(CACHE_TO_PCACHE(cache), "writeback error: %d", ret); + cache->writeback_ctx.ret = ret; + } + + if (!atomic_dec_and_test(&cache->writeback_ctx.pending)) + return; + + if (!cache->writeback_ctx.ret) { + backing_dev_flush(cache->backing_dev); + + mutex_lock(&cache->dirty_tail_lock); + cache_pos_advance(&cache->dirty_tail, cache->writeback_ctx.advance); + cache_encode_dirty_tail(cache); + mutex_unlock(&cache->dirty_tail_lock); + } + queue_delayed_work(cache_get_wq(cache), &cache->writeback_work, 0); +} + +static void writeback_end_req(struct pcache_backing_dev_req *backing_req, int ret) +{ + struct pcache_cache *cache = backing_req->priv_data; + + mutex_lock(&cache->writeback_lock); + writeback_ctx_end(cache, ret); + mutex_unlock(&cache->writeback_lock); +} + +static inline bool is_cache_clean(struct pcache_cache *cache, struct pcache_cache_pos *dirty_tail) +{ + struct dm_pcache *pcache = CACHE_TO_PCACHE(cache); + struct pcache_cache_kset_onmedia *kset_onmedia; + u32 to_copy; + void *addr; + int ret; + + addr = cache_pos_addr(dirty_tail); + kset_onmedia = (struct pcache_cache_kset_onmedia *)cache->wb_kset_onmedia_buf; + + to_copy = min(PCACHE_KSET_ONMEDIA_SIZE_MAX, PCACHE_SEG_SIZE - 
dirty_tail->seg_off); + ret = copy_mc_to_kernel(kset_onmedia, addr, to_copy); + if (ret) { + pcache_dev_err(pcache, "error to read kset: %d", ret); + return true; + } + + /* Check if the magic number matches the expected value */ + if (kset_onmedia->magic != PCACHE_KSET_MAGIC) { + pcache_dev_debug(pcache, "dirty_tail: %u:%u magic: %llx, not expected: %llx\n", + dirty_tail->cache_seg->cache_seg_id, dirty_tail->seg_off, + kset_onmedia->magic, PCACHE_KSET_MAGIC); + return true; + } + + /* Verify the CRC checksum for data integrity */ + if (kset_onmedia->crc != cache_kset_crc(kset_onmedia)) { + pcache_dev_debug(pcache, "dirty_tail: %u:%u crc: %x, not expected: %x\n", + dirty_tail->cache_seg->cache_seg_id, dirty_tail->seg_off, + cache_kset_crc(kset_onmedia), kset_onmedia->crc); + return true; + } + + return false; +} + +void cache_writeback_exit(struct pcache_cache *cache) +{ + cancel_delayed_work_sync(&cache->writeback_work); + backing_dev_flush(cache->backing_dev); + cache_tree_exit(&cache->writeback_key_tree); +} + +int cache_writeback_init(struct pcache_cache *cache) +{ + int ret; + + ret = cache_tree_init(cache, &cache->writeback_key_tree, 1); + if (ret) + goto err; + + atomic_set(&cache->writeback_ctx.pending, 0); + + /* Queue delayed work to start writeback handling */ + queue_delayed_work(cache_get_wq(cache), &cache->writeback_work, 0); + + return 0; +err: + return ret; +} + +static void cache_key_writeback(struct pcache_cache *cache, struct pcache_cache_key *key) +{ + struct pcache_backing_dev_req *writeback_req; + struct pcache_backing_dev_req_opts writeback_req_opts = { 0 }; + struct pcache_cache_pos *pos; + void *addr; + u32 seg_remain, req_len, done = 0; + + if (cache_key_clean(key)) + return; + + pos = &key->cache_pos; + + seg_remain = cache_seg_remain(pos); + BUG_ON(seg_remain < key->len); +next_req: + addr = cache_pos_addr(pos) + done; + req_len = backing_dev_req_coalesced_max_len(addr, key->len - done); + + writeback_req_opts.type = BACKING_DEV_REQ_TYPE_KMEM; + writeback_req_opts.gfp_mask = GFP_NOIO; + writeback_req_opts.end_fn = writeback_end_req; + writeback_req_opts.priv_data = cache; + + writeback_req_opts.kmem.data = addr; + writeback_req_opts.kmem.opf = REQ_OP_WRITE; + writeback_req_opts.kmem.len = req_len; + writeback_req_opts.kmem.backing_off = key->off + done; + + writeback_req = backing_dev_req_create(cache->backing_dev, &writeback_req_opts); + + atomic_inc(&cache->writeback_ctx.pending); + backing_dev_req_submit(writeback_req, true); + + done += req_len; + if (done < key->len) + goto next_req; +} + +static void cache_wb_tree_writeback(struct pcache_cache *cache, u32 advance) +{ + struct pcache_cache_tree *cache_tree = &cache->writeback_key_tree; + struct pcache_cache_subtree *cache_subtree; + struct rb_node *node; + struct pcache_cache_key *key; + u32 i; + + cache->writeback_ctx.ret = 0; + cache->writeback_ctx.advance = advance; + atomic_set(&cache->writeback_ctx.pending, 1); + + for (i = 0; i < cache_tree->n_subtrees; i++) { + cache_subtree = &cache_tree->subtrees[i]; + + node = rb_first(&cache_subtree->root); + while (node) { + key = CACHE_KEY(node); + node = rb_next(node); + + cache_key_writeback(cache, key); + cache_key_delete(key); + } + } + writeback_ctx_end(cache, 0); +} + +static int cache_kset_insert_tree(struct pcache_cache *cache, struct pcache_cache_kset_onmedia *kset_onmedia) +{ + struct pcache_cache_key_onmedia *key_onmedia; + struct pcache_cache_subtree *cache_subtree; + struct pcache_cache_key *key; + int ret; + u32 i; + + /* Iterate through all keys 
in the kset and write each back to storage */ + for (i = 0; i < kset_onmedia->key_num; i++) { + key_onmedia = &kset_onmedia->data[i]; + + key = cache_key_alloc(&cache->writeback_key_tree, GFP_NOIO); + ret = cache_key_decode(cache, key_onmedia, key); + if (ret) { + cache_key_put(key); + goto clear_tree; + } + + cache_subtree = get_subtree(&cache->writeback_key_tree, key->off); + spin_lock(&cache_subtree->tree_lock); + cache_key_insert(&cache->writeback_key_tree, key, true); + spin_unlock(&cache_subtree->tree_lock); + } + + return 0; +clear_tree: + cache_tree_clear(&cache->writeback_key_tree); + return ret; +} + +static void last_kset_writeback(struct pcache_cache *cache, + struct pcache_cache_kset_onmedia *last_kset_onmedia) +{ + struct dm_pcache *pcache = CACHE_TO_PCACHE(cache); + struct pcache_cache_segment *next_seg; + + pcache_dev_debug(pcache, "last kset, next: %u\n", last_kset_onmedia->next_cache_seg_id); + + next_seg = &cache->segments[last_kset_onmedia->next_cache_seg_id]; + + mutex_lock(&cache->dirty_tail_lock); + cache->dirty_tail.cache_seg = next_seg; + cache->dirty_tail.seg_off = 0; + cache_encode_dirty_tail(cache); + mutex_unlock(&cache->dirty_tail_lock); +} + +void cache_writeback_fn(struct work_struct *work) +{ + struct pcache_cache *cache = container_of(work, struct pcache_cache, writeback_work.work); + struct dm_pcache *pcache = CACHE_TO_PCACHE(cache); + struct pcache_cache_pos dirty_tail; + struct pcache_cache_kset_onmedia *kset_onmedia; + u32 delay; + int ret; + + mutex_lock(&cache->writeback_lock); + if (atomic_read(&cache->writeback_ctx.pending)) + goto unlock; + + if (pcache_is_stopping(pcache)) + goto unlock; + + kset_onmedia = (struct pcache_cache_kset_onmedia *)cache->wb_kset_onmedia_buf; + + mutex_lock(&cache->dirty_tail_lock); + cache_pos_copy(&dirty_tail, &cache->dirty_tail); + mutex_unlock(&cache->dirty_tail_lock); + + if (is_cache_clean(cache, &dirty_tail)) { + delay = PCACHE_CACHE_WRITEBACK_INTERVAL; + goto queue_work; + } + + if (kset_onmedia->flags & PCACHE_KSET_FLAGS_LAST) { + last_kset_writeback(cache, kset_onmedia); + delay = 0; + goto queue_work; + } + + ret = cache_kset_insert_tree(cache, kset_onmedia); + if (ret) { + delay = PCACHE_CACHE_WRITEBACK_INTERVAL; + goto queue_work; + } + + cache_wb_tree_writeback(cache, get_kset_onmedia_size(kset_onmedia)); + delay = 0; +queue_work: + queue_delayed_work(cache_get_wq(cache), &cache->writeback_work, delay); +unlock: + mutex_unlock(&cache->writeback_lock); +} diff --git a/drivers/md/dm-pcache/dm_pcache.c b/drivers/md/dm-pcache/dm_pcache.c new file mode 100644 index 000000000000..e5f5936fa6f0 --- /dev/null +++ b/drivers/md/dm-pcache/dm_pcache.c @@ -0,0 +1,497 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include <linux/module.h> +#include <linux/blkdev.h> +#include <linux/bio.h> + +#include "../dm-core.h" +#include "cache_dev.h" +#include "backing_dev.h" +#include "cache.h" +#include "dm_pcache.h" + +void pcache_defer_reqs_kick(struct dm_pcache *pcache) +{ + struct pcache_cache *cache = &pcache->cache; + + spin_lock(&cache->seg_map_lock); + if (!cache->cache_full) + queue_work(pcache->task_wq, &pcache->defered_req_work); + spin_unlock(&cache->seg_map_lock); +} + +static void defer_req(struct pcache_request *pcache_req) +{ + struct dm_pcache *pcache = pcache_req->pcache; + + BUG_ON(!list_empty(&pcache_req->list_node)); + + spin_lock(&pcache->defered_req_list_lock); + list_add(&pcache_req->list_node, &pcache->defered_req_list); + pcache_defer_reqs_kick(pcache); + spin_unlock(&pcache->defered_req_list_lock); 
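+ + /* + * The kick above is a no-op while the cache is marked full; the + * request then waits on defered_req_list until a later kick queues + * the worker again. + */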
+} + +static void defered_req_fn(struct work_struct *work) +{ + struct dm_pcache *pcache = container_of(work, struct dm_pcache, defered_req_work); + struct pcache_request *pcache_req; + LIST_HEAD(tmp_list); + int ret; + + if (pcache_is_stopping(pcache)) + return; + + spin_lock(&pcache->defered_req_list_lock); + list_splice_init(&pcache->defered_req_list, &tmp_list); + spin_unlock(&pcache->defered_req_list_lock); + + while (!list_empty(&tmp_list)) { + pcache_req = list_first_entry(&tmp_list, + struct pcache_request, list_node); + list_del_init(&pcache_req->list_node); + pcache_req->ret = 0; + ret = pcache_cache_handle_req(&pcache->cache, pcache_req); + if (ret == -EBUSY) + defer_req(pcache_req); + else + pcache_req_put(pcache_req, ret); + } +} + +void pcache_req_get(struct pcache_request *pcache_req) +{ + kref_get(&pcache_req->ref); +} + +static void end_req(struct kref *ref) +{ + struct pcache_request *pcache_req = container_of(ref, struct pcache_request, ref); + struct dm_pcache *pcache = pcache_req->pcache; + struct bio *bio = pcache_req->bio; + int ret = pcache_req->ret; + + if (ret == -EBUSY) { + pcache_req_get(pcache_req); + defer_req(pcache_req); + } else { + bio->bi_status = errno_to_blk_status(ret); + bio_endio(bio); + + if (atomic_dec_and_test(&pcache->inflight_reqs)) + wake_up(&pcache->inflight_wq); + } +} + +void pcache_req_put(struct pcache_request *pcache_req, int ret) +{ + /* Set the return status if it is not already set */ + if (ret && !pcache_req->ret) + pcache_req->ret = ret; + + kref_put(&pcache_req->ref, end_req); +} + +static bool at_least_one_arg(struct dm_arg_set *as, char **error) +{ + if (!as->argc) { + *error = "Insufficient args"; + return false; + } + + return true; +} + +static int parse_cache_dev(struct dm_pcache *pcache, struct dm_arg_set *as, + char **error) +{ + int ret; + + if (!at_least_one_arg(as, error)) + return -EINVAL; + ret = dm_get_device(pcache->ti, dm_shift_arg(as), + BLK_OPEN_READ | BLK_OPEN_WRITE, + &pcache->cache_dev.dm_dev); + if (ret) { + *error = "Error opening cache device"; + return ret; + } + + return 0; +} + +static int parse_backing_dev(struct dm_pcache *pcache, struct dm_arg_set *as, + char **error) +{ + int ret; + + if (!at_least_one_arg(as, error)) + return -EINVAL; + + ret = dm_get_device(pcache->ti, dm_shift_arg(as), + BLK_OPEN_READ | BLK_OPEN_WRITE, + &pcache->backing_dev.dm_dev); + if (ret) { + *error = "Error opening backing device"; + return ret; + } + + return 0; +} + +static void pcache_init_opts(struct pcache_cache_options *opts) +{ + opts->cache_mode = PCACHE_CACHE_MODE_WRITEBACK; + opts->data_crc = false; +} + +static int parse_cache_opts(struct dm_pcache *pcache, struct dm_arg_set *as, + char **error) +{ + struct pcache_cache_options *opts = &pcache->opts; + static const struct dm_arg _args[] = { + {0, 4, "Invalid number of cache option arguments"}, + }; + unsigned int argc; + const char *arg; + int ret; + + pcache_init_opts(opts); + if (!as->argc) + return 0; + + ret = dm_read_arg_group(_args, as, &argc, error); + if (ret) + return -EINVAL; + + while (argc) { + arg = dm_shift_arg(as); + argc--; + + if (!strcmp(arg, "cache_mode")) { + arg = dm_shift_arg(as); + if (!strcmp(arg, "writeback")) { + opts->cache_mode = PCACHE_CACHE_MODE_WRITEBACK; + } else { + *error = "Invalid cache mode parameter"; + return -EINVAL; + } + argc--; + } else if (!strcmp(arg, "data_crc")) { + arg = dm_shift_arg(as); + if (!strcmp(arg, "true")) { + opts->data_crc = true; + } else if (!strcmp(arg, "false")) { + opts->data_crc = false; + } else { + 
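/* only "true" or "false" are accepted */ +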
*error = "Invalid data crc parameter"; + return -EINVAL; + } + argc--; + } else { + *error = "Unrecognised cache option requested"; + return -EINVAL; + } + } + + return 0; +} + +static int pcache_start(struct dm_pcache *pcache, char **error) +{ + int ret; + + ret = cache_dev_start(pcache); + if (ret) { + *error = "Failed to start cache dev"; + return ret; + } + + ret = backing_dev_start(pcache); + if (ret) { + *error = "Failed to start backing dev"; + goto stop_cache; + } + + ret = pcache_cache_start(pcache); + if (ret) { + *error = "Failed to start pcache"; + goto stop_backing; + } + + return 0; +stop_backing: + backing_dev_stop(pcache); +stop_cache: + cache_dev_stop(pcache); + + return ret; +} + +static void pcache_destroy_args(struct dm_pcache *pcache) +{ + if (pcache->cache_dev.dm_dev) + dm_put_device(pcache->ti, pcache->cache_dev.dm_dev); + if (pcache->backing_dev.dm_dev) + dm_put_device(pcache->ti, pcache->backing_dev.dm_dev); +} + +static int pcache_parse_args(struct dm_pcache *pcache, unsigned int argc, char **argv, + char **error) +{ + struct dm_arg_set as; + int ret; + + as.argc = argc; + as.argv = argv; + + /* + * Parse cache device + */ + ret = parse_cache_dev(pcache, &as, error); + if (ret) + return ret; + /* + * Parse backing device + */ + ret = parse_backing_dev(pcache, &as, error); + if (ret) + goto out; + /* + * Parse optional arguments + */ + ret = parse_cache_opts(pcache, &as, error); + if (ret) + goto out; + + return 0; +out: + pcache_destroy_args(pcache); + return ret; +} + +static int dm_pcache_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + struct mapped_device *md = ti->table->md; + struct dm_pcache *pcache; + int ret; + + if (md->map) { + ti->error = "Don't support table loading for live md"; + return -EOPNOTSUPP; + } + + /* Allocate memory for the cache structure */ + pcache = kzalloc(sizeof(struct dm_pcache), GFP_KERNEL); + if (!pcache) + return -ENOMEM; + + pcache->task_wq = alloc_workqueue("pcache-%s-wq", WQ_UNBOUND | WQ_MEM_RECLAIM, + 0, md->name); + if (!pcache->task_wq) { + ret = -ENOMEM; + goto free_pcache; + } + + spin_lock_init(&pcache->defered_req_list_lock); + INIT_LIST_HEAD(&pcache->defered_req_list); + INIT_WORK(&pcache->defered_req_work, defered_req_fn); + pcache->ti = ti; + + ret = pcache_parse_args(pcache, argc, argv, &ti->error); + if (ret) + goto destroy_wq; + + ret = pcache_start(pcache, &ti->error); + if (ret) + goto destroy_args; + + ti->num_flush_bios = 1; + ti->flush_supported = true; + ti->per_io_data_size = sizeof(struct pcache_request); + ti->private = pcache; + atomic_set(&pcache->inflight_reqs, 0); + atomic_set(&pcache->state, PCACHE_STATE_RUNNING); + init_waitqueue_head(&pcache->inflight_wq); + + return 0; +destroy_args: + pcache_destroy_args(pcache); +destroy_wq: + destroy_workqueue(pcache->task_wq); +free_pcache: + kfree(pcache); + + return ret; +} + +static void defer_req_stop(struct dm_pcache *pcache) +{ + struct pcache_request *pcache_req; + LIST_HEAD(tmp_list); + + flush_work(&pcache->defered_req_work); + + spin_lock(&pcache->defered_req_list_lock); + list_splice_init(&pcache->defered_req_list, &tmp_list); + spin_unlock(&pcache->defered_req_list_lock); + + while (!list_empty(&tmp_list)) { + pcache_req = list_first_entry(&tmp_list, + struct pcache_request, list_node); + list_del_init(&pcache_req->list_node); + pcache_req_put(pcache_req, -EIO); + } +} + +static void dm_pcache_dtr(struct dm_target *ti) +{ + struct dm_pcache *pcache; + + pcache = ti->private; + atomic_set(&pcache->state, PCACHE_STATE_STOPPING); + 
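/* + * Teardown: fail any still-deferred requests with -EIO, wait for + * inflight requests to drain, then stop the cache, backing device + * and cache device before freeing the pcache structure. + */ +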
defer_req_stop(pcache); + + wait_event(pcache->inflight_wq, + atomic_read(&pcache->inflight_reqs) == 0); + + pcache_cache_stop(pcache); + backing_dev_stop(pcache); + cache_dev_stop(pcache); + + pcache_destroy_args(pcache); + drain_workqueue(pcache->task_wq); + destroy_workqueue(pcache->task_wq); + + kfree(pcache); +} + +static int dm_pcache_map_bio(struct dm_target *ti, struct bio *bio) +{ + struct pcache_request *pcache_req = dm_per_bio_data(bio, sizeof(struct pcache_request)); + struct dm_pcache *pcache = ti->private; + int ret; + + pcache_req->pcache = pcache; + kref_init(&pcache_req->ref); + pcache_req->ret = 0; + pcache_req->bio = bio; + pcache_req->off = (u64)bio->bi_iter.bi_sector << SECTOR_SHIFT; + pcache_req->data_len = bio->bi_iter.bi_size; + INIT_LIST_HEAD(&pcache_req->list_node); + atomic_inc(&pcache->inflight_reqs); + + ret = pcache_cache_handle_req(&pcache->cache, pcache_req); + if (ret == -EBUSY) + defer_req(pcache_req); + else + pcache_req_put(pcache_req, ret); + + return DM_MAPIO_SUBMITTED; +} + +static void dm_pcache_status(struct dm_target *ti, status_type_t type, + unsigned int status_flags, char *result, + unsigned int maxlen) +{ + struct dm_pcache *pcache = ti->private; + struct pcache_cache_dev *cache_dev = &pcache->cache_dev; + struct pcache_backing_dev *backing_dev = &pcache->backing_dev; + struct pcache_cache *cache = &pcache->cache; + unsigned int sz = 0; + + switch (type) { + case STATUSTYPE_INFO: + DMEMIT("%x %u %u %u %u %x %u:%u %u:%u %u:%u", + cache_dev->sb_flags, + cache_dev->seg_num, + cache->n_segs, + bitmap_weight(cache->seg_map, cache->n_segs), + pcache_cache_get_gc_percent(cache), + cache->cache_info.flags, + cache->key_head.cache_seg->cache_seg_id, + cache->key_head.seg_off, + cache->dirty_tail.cache_seg->cache_seg_id, + cache->dirty_tail.seg_off, + cache->key_tail.cache_seg->cache_seg_id, + cache->key_tail.seg_off); + break; + case STATUSTYPE_TABLE: + DMEMIT("%s %s 4 cache_mode writeback crc %s", + cache_dev->dm_dev->name, + backing_dev->dm_dev->name, + cache_data_crc_on(cache) ? 
"true" : "false"); + break; + case STATUSTYPE_IMA: + *result = '\0'; + break; + } +} + +static int dm_pcache_message(struct dm_target *ti, unsigned int argc, + char **argv, char *result, unsigned int maxlen) +{ + struct dm_pcache *pcache = ti->private; + unsigned long val; + + if (argc != 2) + goto err; + + if (!strcasecmp(argv[0], "gc_percent")) { + if (kstrtoul(argv[1], 10, &val)) + goto err; + + return pcache_cache_set_gc_percent(&pcache->cache, val); + } +err: + return -EINVAL; +} + +static struct target_type dm_pcache_target = { + .name = "pcache", + .version = {0, 1, 0}, + .module = THIS_MODULE, + .features = DM_TARGET_SINGLETON, + .ctr = dm_pcache_ctr, + .dtr = dm_pcache_dtr, + .map = dm_pcache_map_bio, + .status = dm_pcache_status, + .message = dm_pcache_message, +}; + +static int __init dm_pcache_init(void) +{ + int ret; + + ret = pcache_backing_init(); + if (ret) + goto err; + + ret = pcache_cache_init(); + if (ret) + goto backing_exit; + + ret = dm_register_target(&dm_pcache_target); + if (ret) + goto cache_exit; + return 0; + +cache_exit: + pcache_cache_exit(); +backing_exit: + pcache_backing_exit(); +err: + return ret; +} +module_init(dm_pcache_init); + +static void __exit dm_pcache_exit(void) +{ + dm_unregister_target(&dm_pcache_target); + pcache_cache_exit(); + pcache_backing_exit(); +} +module_exit(dm_pcache_exit); + +MODULE_DESCRIPTION("dm-pcache Persistent Cache for block device"); +MODULE_AUTHOR("Dongsheng Yang <dongsheng.yang@linux.dev>"); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-pcache/dm_pcache.h b/drivers/md/dm-pcache/dm_pcache.h new file mode 100644 index 000000000000..b4e06be0c0b9 --- /dev/null +++ b/drivers/md/dm-pcache/dm_pcache.h @@ -0,0 +1,67 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _DM_PCACHE_H +#define _DM_PCACHE_H +#include <linux/device-mapper.h> + +#include "../dm-core.h" + +#define CACHE_DEV_TO_PCACHE(cache_dev) (container_of(cache_dev, struct dm_pcache, cache_dev)) +#define BACKING_DEV_TO_PCACHE(backing_dev) (container_of(backing_dev, struct dm_pcache, backing_dev)) +#define CACHE_TO_PCACHE(cache) (container_of(cache, struct dm_pcache, cache)) + +#define PCACHE_STATE_RUNNING 1 +#define PCACHE_STATE_STOPPING 2 + +struct pcache_cache_dev; +struct pcache_backing_dev; +struct pcache_cache; +struct pcache_cache_options; +struct dm_pcache { + struct dm_target *ti; + struct pcache_cache_dev cache_dev; + struct pcache_backing_dev backing_dev; + struct pcache_cache cache; + struct pcache_cache_options opts; + + spinlock_t defered_req_list_lock; + struct list_head defered_req_list; + struct workqueue_struct *task_wq; + + struct work_struct defered_req_work; + + atomic_t state; + atomic_t inflight_reqs; + wait_queue_head_t inflight_wq; +}; + +static inline bool pcache_is_stopping(struct dm_pcache *pcache) +{ + return (atomic_read(&pcache->state) == PCACHE_STATE_STOPPING); +} + +#define pcache_dev_err(pcache, fmt, ...) \ + pcache_err("%s " fmt, pcache->ti->table->md->name, ##__VA_ARGS__) +#define pcache_dev_info(pcache, fmt, ...) \ + pcache_info("%s " fmt, pcache->ti->table->md->name, ##__VA_ARGS__) +#define pcache_dev_debug(pcache, fmt, ...) 
\ + pcache_debug("%s " fmt, pcache->ti->table->md->name, ##__VA_ARGS__) + +struct pcache_request { + struct dm_pcache *pcache; + struct bio *bio; + + u64 off; + u32 data_len; + + struct kref ref; + int ret; + + struct list_head list_node; +}; + +void pcache_req_get(struct pcache_request *pcache_req); +void pcache_req_put(struct pcache_request *pcache_req, int ret); + +void pcache_defer_reqs_kick(struct dm_pcache *pcache); + +#endif /* _DM_PCACHE_H */ diff --git a/drivers/md/dm-pcache/pcache_internal.h b/drivers/md/dm-pcache/pcache_internal.h new file mode 100644 index 000000000000..d427e534727c --- /dev/null +++ b/drivers/md/dm-pcache/pcache_internal.h @@ -0,0 +1,117 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _PCACHE_INTERNAL_H +#define _PCACHE_INTERNAL_H + +#include <linux/delay.h> +#include <linux/crc32c.h> + +#define pcache_err(fmt, ...) \ + pr_err("dm-pcache: %s:%u " fmt, __func__, __LINE__, ##__VA_ARGS__) +#define pcache_info(fmt, ...) \ + pr_info("dm-pcache: %s:%u " fmt, __func__, __LINE__, ##__VA_ARGS__) +#define pcache_debug(fmt, ...) \ + pr_debug("dm-pcache: %s:%u " fmt, __func__, __LINE__, ##__VA_ARGS__) + +#define PCACHE_KB (1024ULL) +#define PCACHE_MB (1024 * PCACHE_KB) + +/* Maximum number of metadata indices */ +#define PCACHE_META_INDEX_MAX 2 + +#define PCACHE_CRC_SEED 0x3B15A +/* + * struct pcache_meta_header - PCACHE metadata header structure + * @crc: CRC checksum for validating metadata integrity. + * @seq: Sequence number to track metadata updates. + * @version: Metadata version. + * @res: Reserved space for future use. + */ +struct pcache_meta_header { + __u32 crc; + __u8 seq; + __u8 version; + __u16 res; +}; + +/* + * pcache_meta_crc - Calculate CRC for the given metadata header. + * @header: Pointer to the metadata header. + * @meta_size: Size of the metadata structure. + * + * Returns the CRC checksum calculated by excluding the CRC field itself. + */ +static inline u32 pcache_meta_crc(struct pcache_meta_header *header, u32 meta_size) +{ + return crc32c(PCACHE_CRC_SEED, (void *)header + 4, meta_size - 4); +} + +/* + * pcache_meta_seq_after - Check if a sequence number is more recent, accounting for overflow. + * @seq1: First sequence number. + * @seq2: Second sequence number. + * + * Determines if @seq1 is more recent than @seq2 by calculating the signed + * difference between them. This approach allows handling sequence number + * overflow correctly because the difference wraps naturally, and any value + * greater than zero indicates that @seq1 is "after" @seq2. This method + * assumes 8-bit unsigned sequence numbers, where the difference wraps + * around if seq1 overflows past seq2. + * + * Returns: + * - true if @seq1 is more recent than @seq2, indicating it comes "after" + * - false otherwise. + */ +static inline bool pcache_meta_seq_after(u8 seq1, u8 seq2) +{ + return (s8)(seq1 - seq2) > 0; +} + +/* + * pcache_meta_find_latest - Find the latest valid metadata. + * @header: Pointer to the metadata header. + * @meta_size: Size of each metadata block. + * + * Finds the latest valid metadata by checking sequence numbers. If a + * valid entry with the highest sequence number is found, its pointer + * is returned. Returns NULL if no valid metadata is found. 
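+ * + * For example, with PCACHE_META_INDEX_MAX == 2 copies and 8-bit sequence + * numbers, a copy with seq 0 is newer than one with seq 255, because + * (s8)(0 - 255) == 1 > 0; alternating between the two slots on update + * therefore survives both sequence wraparound and a torn write of one slot.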
+ */ +static inline void __must_check *pcache_meta_find_latest(struct pcache_meta_header *header, + u32 meta_size, u32 meta_max_size, + void *meta_ret) +{ + struct pcache_meta_header *meta, *latest = NULL; + u32 i, seq_latest = 0; + void *meta_addr; + + meta = meta_ret; + + for (i = 0; i < PCACHE_META_INDEX_MAX; i++) { + meta_addr = (void *)header + (i * meta_max_size); + if (copy_mc_to_kernel(meta, meta_addr, meta_size)) { + pcache_err("hardware memory error when copy meta"); + return ERR_PTR(-EIO); + } + + /* Skip if CRC check fails, which means corrupted */ + if (meta->crc != pcache_meta_crc(meta, meta_size)) + continue; + + /* Update latest if a more recent sequence is found */ + if (!latest || pcache_meta_seq_after(meta->seq, seq_latest)) { + seq_latest = meta->seq; + latest = (void *)header + (i * meta_max_size); + } + } + + if (!latest) + return NULL; + + if (copy_mc_to_kernel(meta_ret, latest, meta_size)) { + pcache_err("hardware memory error"); + return ERR_PTR(-EIO); + } + + return latest; +} + +#endif /* _PCACHE_INTERNAL_H */ diff --git a/drivers/md/dm-pcache/segment.c b/drivers/md/dm-pcache/segment.c new file mode 100644 index 000000000000..7e9818701445 --- /dev/null +++ b/drivers/md/dm-pcache/segment.c @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include <linux/dax.h> + +#include "pcache_internal.h" +#include "cache_dev.h" +#include "segment.h" + +int segment_copy_to_bio(struct pcache_segment *segment, + u32 data_off, u32 data_len, struct bio *bio, u32 bio_off) +{ + struct iov_iter iter; + size_t copied; + void *src; + + iov_iter_bvec(&iter, ITER_DEST, &bio->bi_io_vec[bio->bi_iter.bi_idx], + bio_segments(bio), bio->bi_iter.bi_size); + iter.iov_offset = bio->bi_iter.bi_bvec_done; + if (bio_off) + iov_iter_advance(&iter, bio_off); + + src = segment->data + data_off; + copied = _copy_mc_to_iter(src, data_len, &iter); + if (copied != data_len) + return -EIO; + + return 0; +} + +int segment_copy_from_bio(struct pcache_segment *segment, + u32 data_off, u32 data_len, struct bio *bio, u32 bio_off) +{ + struct iov_iter iter; + size_t copied; + void *dst; + + iov_iter_bvec(&iter, ITER_SOURCE, &bio->bi_io_vec[bio->bi_iter.bi_idx], + bio_segments(bio), bio->bi_iter.bi_size); + iter.iov_offset = bio->bi_iter.bi_bvec_done; + if (bio_off) + iov_iter_advance(&iter, bio_off); + + dst = segment->data + data_off; + copied = _copy_from_iter_flushcache(dst, data_len, &iter); + if (copied != data_len) + return -EIO; + pmem_wmb(); + + return 0; +} + +void pcache_segment_init(struct pcache_cache_dev *cache_dev, struct pcache_segment *segment, + struct pcache_segment_init_options *options) +{ + segment->seg_info = options->seg_info; + segment_info_set_type(segment->seg_info, options->type); + + segment->cache_dev = cache_dev; + segment->seg_id = options->seg_id; + segment->data_size = PCACHE_SEG_SIZE - options->data_off; + segment->data = CACHE_DEV_SEGMENT(cache_dev, options->seg_id) + options->data_off; +} diff --git a/drivers/md/dm-pcache/segment.h b/drivers/md/dm-pcache/segment.h new file mode 100644 index 000000000000..deca1ddcb02b --- /dev/null +++ b/drivers/md/dm-pcache/segment.h @@ -0,0 +1,74 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _PCACHE_SEGMENT_H +#define _PCACHE_SEGMENT_H + +#include <linux/bio.h> +#include <linux/bitfield.h> + +#include "pcache_internal.h" + +struct pcache_segment_info { + struct pcache_meta_header header; + __u32 flags; + __u32 next_seg; +}; + +#define PCACHE_SEG_INFO_FLAGS_HAS_NEXT BIT(0) + +#define PCACHE_SEG_INFO_FLAGS_TYPE_MASK 
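/* bits 4..1 of seg_info->flags hold the segment type */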
GENMASK(4, 1) +#define PCACHE_SEGMENT_TYPE_CACHE_DATA 1 + +static inline bool segment_info_has_next(struct pcache_segment_info *seg_info) +{ + return (seg_info->flags & PCACHE_SEG_INFO_FLAGS_HAS_NEXT); +} + +static inline void segment_info_set_type(struct pcache_segment_info *seg_info, u8 type) +{ + seg_info->flags &= ~PCACHE_SEG_INFO_FLAGS_TYPE_MASK; + seg_info->flags |= FIELD_PREP(PCACHE_SEG_INFO_FLAGS_TYPE_MASK, type); +} + +static inline u8 segment_info_get_type(struct pcache_segment_info *seg_info) +{ + return FIELD_GET(PCACHE_SEG_INFO_FLAGS_TYPE_MASK, seg_info->flags); +} + +struct pcache_segment_pos { + struct pcache_segment *segment; /* Segment associated with the position */ + u32 off; /* Offset within the segment */ +}; + +struct pcache_segment_init_options { + u8 type; + u32 seg_id; + u32 data_off; + + struct pcache_segment_info *seg_info; +}; + +struct pcache_segment { + struct pcache_cache_dev *cache_dev; + + void *data; + u32 data_size; + u32 seg_id; + + struct pcache_segment_info *seg_info; +}; + +int segment_copy_to_bio(struct pcache_segment *segment, + u32 data_off, u32 data_len, struct bio *bio, u32 bio_off); +int segment_copy_from_bio(struct pcache_segment *segment, + u32 data_off, u32 data_len, struct bio *bio, u32 bio_off); + +static inline void segment_pos_advance(struct pcache_segment_pos *seg_pos, u32 len) +{ + BUG_ON(seg_pos->off + len > seg_pos->segment->data_size); + + seg_pos->off += len; +} + +void pcache_segment_init(struct pcache_cache_dev *cache_dev, struct pcache_segment *segment, + struct pcache_segment_init_options *options); +#endif /* _PCACHE_SEGMENT_H */ diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 0a1788fed68c..c6f7129e43d3 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -3247,7 +3247,7 @@ size_check: rs_reset_inconclusive_reshape(rs); /* Start raid set read-only and assumed clean to change in raid_resume() */ - rs->md.ro = 1; + rs->md.ro = MD_RDONLY; rs->md.in_sync = 1; /* Has to be held on running the array */ @@ -3385,7 +3385,7 @@ static enum sync_state decipher_sync_action(struct mddev *mddev, unsigned long r /* The MD sync thread can be done with io or be interrupted but still be running */ if (!test_bit(MD_RECOVERY_DONE, &recovery) && (test_bit(MD_RECOVERY_RUNNING, &recovery) || - (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &recovery)))) { + (md_is_rdwr(mddev) && test_bit(MD_RECOVERY_NEEDED, &recovery)))) { if (test_bit(MD_RECOVERY_RESHAPE, &recovery)) return st_reshape; @@ -3775,11 +3775,11 @@ static int raid_message(struct dm_target *ti, unsigned int argc, char **argv, } else return -EINVAL; } - if (mddev->ro == 2) { + if (mddev->ro == MD_AUTO_READ) { /* A write to sync_action is enough to justify * canceling read-auto mode */ - mddev->ro = 0; + mddev->ro = MD_RDWR; if (!mddev->suspended) md_wakeup_thread(mddev->sync_thread); } @@ -3860,6 +3860,7 @@ static void raid_postsuspend(struct dm_target *ti) */ md_stop_writes(&rs->md); mddev_suspend(&rs->md, false); + rs->md.ro = MD_RDONLY; } } @@ -3972,7 +3973,7 @@ static void rs_update_sbs(struct raid_set *rs) int ro = mddev->ro; set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); - mddev->ro = 0; + mddev->ro = MD_RDWR; md_update_sb(mddev, 1); mddev->ro = ro; } @@ -4131,7 +4132,7 @@ static void raid_resume(struct dm_target *ti) WARN_ON_ONCE(rcu_dereference_protected(mddev->sync_thread, lockdep_is_held(&mddev->reconfig_mutex))); clear_bit(RT_FLAG_RS_FROZEN, &rs->runtime_flags); - mddev->ro = 0; + mddev->ro = MD_RDWR; mddev->in_sync = 0; md_unfrozen_sync_thread(mddev); 
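/* array is writable again; MD may restart any pending sync/recovery */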
mddev_unlock_and_resume(mddev); diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c index a4550975c27d..e9b47b659976 100644 --- a/drivers/md/dm-region-hash.c +++ b/drivers/md/dm-region-hash.c @@ -206,7 +206,7 @@ struct dm_region_hash *dm_region_hash_create( rh->shift = RH_HASH_SHIFT; rh->prime = RH_HASH_MULT; - rh->buckets = vmalloc(array_size(nr_buckets, sizeof(*rh->buckets))); + rh->buckets = vmalloc_array(nr_buckets, sizeof(*rh->buckets)); if (!rh->buckets) { DMERR("unable to allocate region hash bucket memory"); kfree(rh); diff --git a/drivers/md/dm-switch.c b/drivers/md/dm-switch.c index bb1a70b5a215..50a52ca50b34 100644 --- a/drivers/md/dm-switch.c +++ b/drivers/md/dm-switch.c @@ -114,8 +114,8 @@ static int alloc_region_table(struct dm_target *ti, unsigned int nr_paths) return -EINVAL; } - sctx->region_table = vmalloc(array_size(nr_slots, - sizeof(region_table_slot_t))); + sctx->region_table = vmalloc_array(nr_slots, + sizeof(region_table_slot_t)); if (!sctx->region_table) { ti->error = "Cannot allocate region table"; return -ENOMEM; diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c index 2af5a9514c05..8fede41adec0 100644 --- a/drivers/md/dm-target.c +++ b/drivers/md/dm-target.c @@ -263,7 +263,8 @@ static long io_err_dax_direct_access(struct dm_target *ti, pgoff_t pgoff, static struct target_type error_target = { .name = "error", .version = {1, 7, 0}, - .features = DM_TARGET_WILDCARD | DM_TARGET_ZONED_HM, + .features = DM_TARGET_WILDCARD | DM_TARGET_ZONED_HM | + DM_TARGET_PASSES_INTEGRITY, .ctr = io_err_ctr, .dtr = io_err_dtr, .map = io_err_map, diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 007bb93e5fca..c84149ba4e38 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -3031,8 +3031,8 @@ static struct pool *pool_create(struct mapped_device *pool_md, } pool->cell_sort_array = - vmalloc(array_size(CELL_SORT_ARRAY_SIZE, - sizeof(*pool->cell_sort_array))); + vmalloc_array(CELL_SORT_ARRAY_SIZE, + sizeof(*pool->cell_sort_array)); if (!pool->cell_sort_array) { *error = "Error allocating cell sort array"; err_p = ERR_PTR(-ENOMEM); diff --git a/drivers/md/dm-vdo/data-vio.c b/drivers/md/dm-vdo/data-vio.c index 810002747091..262e11581f2d 100644 --- a/drivers/md/dm-vdo/data-vio.c +++ b/drivers/md/dm-vdo/data-vio.c @@ -17,6 +17,7 @@ #include <linux/minmax.h> #include <linux/sched.h> #include <linux/spinlock.h> +#include <linux/string.h> #include <linux/wait.h> #include "logger.h" @@ -509,18 +510,6 @@ static void launch_data_vio(struct data_vio *data_vio, logical_block_number_t lb vdo_enqueue_completion(completion, VDO_DEFAULT_Q_MAP_BIO_PRIORITY); } -static bool is_zero_block(char *block) -{ - int i; - - for (i = 0; i < VDO_BLOCK_SIZE; i += sizeof(u64)) { - if (*((u64 *) &block[i])) - return false; - } - - return true; -} - static void copy_from_bio(struct bio *bio, char *data_ptr) { struct bio_vec biovec; @@ -572,7 +561,7 @@ static void launch_bio(struct vdo *vdo, struct data_vio *data_vio, struct bio *b * we acknowledge the bio. 
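* The copied block is then checked with mem_is_zero() so VDO can give * all-zero data its special zero-block handling.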
*/ copy_from_bio(bio, data_vio->vio.data); - data_vio->is_zero = is_zero_block(data_vio->vio.data); + data_vio->is_zero = mem_is_zero(data_vio->vio.data, VDO_BLOCK_SIZE); data_vio->write = true; } @@ -1459,7 +1448,7 @@ static void modify_for_partial_write(struct vdo_completion *completion) copy_from_bio(bio, data + data_vio->offset); } - data_vio->is_zero = is_zero_block(data); + data_vio->is_zero = mem_is_zero(data, VDO_BLOCK_SIZE); data_vio->read = false; launch_data_vio_logical_callback(data_vio, continue_data_vio_with_block_map_slot); diff --git a/drivers/md/dm-vdo/indexer/volume-index.c b/drivers/md/dm-vdo/indexer/volume-index.c index 12f954a0c532..afb062e1f1fb 100644 --- a/drivers/md/dm-vdo/indexer/volume-index.c +++ b/drivers/md/dm-vdo/indexer/volume-index.c @@ -836,7 +836,7 @@ static int start_restoring_volume_sub_index(struct volume_sub_index *sub_index, "%zu bytes decoded of %zu expected", offset, sizeof(buffer)); if (result != VDO_SUCCESS) - result = UDS_CORRUPT_DATA; + return UDS_CORRUPT_DATA; if (memcmp(header.magic, MAGIC_START_5, MAGIC_SIZE) != 0) { return vdo_log_warning_strerror(UDS_CORRUPT_DATA, @@ -928,7 +928,7 @@ static int start_restoring_volume_index(struct volume_index *volume_index, "%zu bytes decoded of %zu expected", offset, sizeof(buffer)); if (result != VDO_SUCCESS) - result = UDS_CORRUPT_DATA; + return UDS_CORRUPT_DATA; if (memcmp(header.magic, MAGIC_START_6, MAGIC_SIZE) != 0) return vdo_log_warning_strerror(UDS_CORRUPT_DATA, diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 7bd6fa05b00a..f5e5e59b232b 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -490,18 +490,13 @@ u64 dm_start_time_ns_from_clone(struct bio *bio) } EXPORT_SYMBOL_GPL(dm_start_time_ns_from_clone); -static inline bool bio_is_flush_with_data(struct bio *bio) -{ - return ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size); -} - static inline unsigned int dm_io_sectors(struct dm_io *io, struct bio *bio) { /* * If REQ_PREFLUSH set, don't account payload, it will be * submitted (and accounted) after this flush completes. */ - if (bio_is_flush_with_data(bio)) + if (io->requeue_flush_with_data) return 0; if (unlikely(dm_io_flagged(io, DM_IO_WAS_SPLIT))) return io->sectors; @@ -590,6 +585,7 @@ static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio, gfp_t g io = container_of(tio, struct dm_io, tio); io->magic = DM_IO_MAGIC; io->status = BLK_STS_OK; + io->requeue_flush_with_data = false; /* one ref is for submission, the other is for completion */ atomic_set(&io->io_count, 2); @@ -948,6 +944,7 @@ static void __dm_io_complete(struct dm_io *io, bool first_stage) struct mapped_device *md = io->md; blk_status_t io_error; bool requeued; + bool requeue_flush_with_data; requeued = dm_handle_requeue(io, first_stage); if (requeued && first_stage) @@ -964,6 +961,7 @@ static void __dm_io_complete(struct dm_io *io, bool first_stage) __dm_start_io_acct(io); dm_end_io_acct(io); } + requeue_flush_with_data = io->requeue_flush_with_data; free_io(io); smp_wmb(); this_cpu_dec(*md->pending_io); @@ -976,7 +974,7 @@ static void __dm_io_complete(struct dm_io *io, bool first_stage) if (requeued) return; - if (bio_is_flush_with_data(bio)) { + if (unlikely(requeue_flush_with_data)) { /* * Preflush done for flush with data, reissue * without REQ_PREFLUSH. 
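* (The split happens in dm_split_and_process_bio(): the empty preflush * goes through __send_empty_flush() first, then dm_io_complete() * resubmits the data portion.)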
@@ -1996,12 +1994,30 @@ static void dm_split_and_process_bio(struct mapped_device *md, } init_clone_info(&ci, io, map, bio, is_abnormal); - if (bio->bi_opf & REQ_PREFLUSH) { + if (unlikely((bio->bi_opf & REQ_PREFLUSH) != 0)) { + /* + * The "flush_bypasses_map" is set on targets where it is safe + * to skip the map function and submit bios directly to the + * underlying block devices - currently, it is set for dm-linear + * and dm-stripe. + * + * If we have just one underlying device (i.e. there is one + * linear target or multiple linear targets pointing to the same + * device), we can send the flush with data directly to it. + */ + if (map->flush_bypasses_map) { + struct list_head *devices = dm_table_get_devices(map); + if (devices->next == devices->prev) + goto send_preflush_with_data; + } + if (bio->bi_iter.bi_size) + io->requeue_flush_with_data = true; __send_empty_flush(&ci); /* dm_io_complete submits any data associated with flush */ goto out; } +send_preflush_with_data: if (static_branch_unlikely(&zoned_enabled) && (bio_op(bio) == REQ_OP_ZONE_RESET_ALL)) { error = __send_zone_reset_all(&ci); @@ -2908,7 +2924,7 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map, { bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG; bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG; - int r; + int r = 0; lockdep_assert_held(&md->suspend_lock); @@ -2960,8 +2976,10 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map, * Stop md->queue before flushing md->wq in case request-based * dm defers requests to md->wq from md->queue. */ - if (dm_request_based(md)) + if (map && dm_request_based(md)) { dm_stop_queue(md->queue); + set_bit(DMF_QUEUE_STOPPED, &md->flags); + } flush_workqueue(md->wq); @@ -2970,7 +2988,8 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map, * We call dm_wait_for_completion to wait for all existing requests * to finish. */ - r = dm_wait_for_completion(md, task_state); + if (map) + r = dm_wait_for_completion(md, task_state); if (!r) set_bit(dmf_suspended_flag, &md->flags); @@ -2983,7 +3002,7 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map, if (r < 0) { dm_queue_flush(md); - if (dm_request_based(md)) + if (test_and_clear_bit(DMF_QUEUE_STOPPED, &md->flags)) dm_start_queue(md->queue); unlock_fs(md); @@ -3067,7 +3086,7 @@ static int __dm_resume(struct mapped_device *md, struct dm_table *map) * so that mapping of targets can work correctly. * Request-based dm is queueing the deferred I/Os in its request_queue. 
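* (DMF_QUEUE_STOPPED, set in __dm_suspend() when the queue is stopped, * ensures resume restarts only a queue that was actually stopped, which * also covers suspends that ran with a NULL table.)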
*/ - if (dm_request_based(md)) + if (test_and_clear_bit(DMF_QUEUE_STOPPED, &md->flags)) dm_start_queue(md->queue); unlock_fs(md); diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 1d0e0e7362bd..3fc33b1b4dfb 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -9705,6 +9705,8 @@ static int __bnxt_hwrm_func_qcaps(struct bnxt *bp) flags_ext3 = le32_to_cpu(resp->flags_ext3); if (flags_ext3 & FUNC_QCAPS_RESP_FLAGS_EXT3_ROCE_VF_DYN_ALLOC_SUPPORT) bp->fw_cap |= BNXT_FW_CAP_ROCE_VF_DYN_ALLOC_SUPPORT; + if (flags_ext3 & FUNC_QCAPS_RESP_FLAGS_EXT3_MIRROR_ON_ROCE_SUPPORTED) + bp->fw_cap |= BNXT_FW_CAP_MIRROR_ON_ROCE; bp->tx_push_thresh = 0; if ((flags & FUNC_QCAPS_RESP_FLAGS_PUSH_MODE_SUPPORTED) && diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index 06a4c2afdf8a..741b2d854789 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -2514,6 +2514,7 @@ struct bnxt { #define BNXT_FW_CAP_VNIC_RE_FLUSH BIT_ULL(40) #define BNXT_FW_CAP_SW_MAX_RESOURCE_LIMITS BIT_ULL(41) #define BNXT_FW_CAP_NPAR_1_2 BIT_ULL(42) + #define BNXT_FW_CAP_MIRROR_ON_ROCE BIT_ULL(43) u32 fw_dbg_cap; @@ -2537,6 +2538,8 @@ struct bnxt { ((bp)->fw_cap & BNXT_FW_CAP_ROCE_VF_RESC_MGMT_SUPPORTED) #define BNXT_SW_RES_LMT(bp) \ ((bp)->fw_cap & BNXT_FW_CAP_SW_MAX_RESOURCE_LIMITS) +#define BNXT_MIRROR_ON_ROCE_CAP(bp) \ + ((bp)->fw_cap & BNXT_FW_CAP_MIRROR_ON_ROCE) u32 hwrm_spec_code; u16 hwrm_cmd_seq; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c index 61cf201bb0dc..f8c2c72b382d 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c @@ -100,6 +100,12 @@ void bnxt_set_dflt_ulp_stat_ctxs(struct bnxt *bp) if (BNXT_PF(bp) && !bp->pf.port_id && bp->port_count > 1) bp->edev->ulp_num_ctxs++; + + /* Reserve one additional stat_ctx when the device is capable + * of supporting port mirroring on RDMA device. + */ + if (BNXT_MIRROR_ON_ROCE_CAP(bp)) + bp->edev->ulp_num_ctxs++; } } diff --git a/drivers/net/ethernet/pensando/Kconfig b/drivers/net/ethernet/pensando/Kconfig index 01fe76786f77..c99758adf3ad 100644 --- a/drivers/net/ethernet/pensando/Kconfig +++ b/drivers/net/ethernet/pensando/Kconfig @@ -24,6 +24,7 @@ config IONIC select NET_DEVLINK select DIMLIB select PAGE_POOL + select AUXILIARY_BUS help This enables the support for the Pensando family of Ethernet adapters. 
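It also selects the auxiliary bus, which the driver uses to register an "rdma" auxiliary device for an RDMA driver to bind to.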
More specific information on this driver can be diff --git a/drivers/net/ethernet/pensando/ionic/Makefile b/drivers/net/ethernet/pensando/ionic/Makefile index 4e7642a2d25f..a598972fef41 100644 --- a/drivers/net/ethernet/pensando/ionic/Makefile +++ b/drivers/net/ethernet/pensando/ionic/Makefile @@ -5,5 +5,5 @@ obj-$(CONFIG_IONIC) := ionic.o ionic-y := ionic_main.o ionic_bus_pci.o ionic_devlink.o ionic_dev.o \ ionic_debugfs.o ionic_lif.o ionic_rx_filter.o ionic_ethtool.o \ - ionic_txrx.o ionic_stats.o ionic_fw.o + ionic_txrx.o ionic_stats.o ionic_fw.o ionic_aux.o ionic-$(CONFIG_PTP_1588_CLOCK) += ionic_phc.o diff --git a/drivers/net/ethernet/pensando/ionic/ionic.h b/drivers/net/ethernet/pensando/ionic/ionic.h index 04f00ea94230..85198e6a806e 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic.h +++ b/drivers/net/ethernet/pensando/ionic/ionic.h @@ -65,16 +65,9 @@ struct ionic { int watchdog_period; }; -struct ionic_admin_ctx { - struct completion work; - union ionic_adminq_cmd cmd; - union ionic_adminq_comp comp; -}; - int ionic_adminq_post(struct ionic_lif *lif, struct ionic_admin_ctx *ctx); int ionic_adminq_wait(struct ionic_lif *lif, struct ionic_admin_ctx *ctx, const int err, const bool do_msg); -int ionic_adminq_post_wait(struct ionic_lif *lif, struct ionic_admin_ctx *ctx); int ionic_adminq_post_wait_nomsg(struct ionic_lif *lif, struct ionic_admin_ctx *ctx); void ionic_adminq_netdev_err_print(struct ionic_lif *lif, u8 opcode, u8 status, int err); diff --git a/drivers/net/ethernet/pensando/ionic/ionic_api.h b/drivers/net/ethernet/pensando/ionic/ionic_api.h new file mode 100644 index 000000000000..bd88666836b8 --- /dev/null +++ b/drivers/net/ethernet/pensando/ionic/ionic_api.h @@ -0,0 +1,131 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2025, Advanced Micro Devices, Inc. */ + +#ifndef _IONIC_API_H_ +#define _IONIC_API_H_ + +#include <linux/auxiliary_bus.h> +#include "ionic_if.h" +#include "ionic_regs.h" + +/** + * struct ionic_aux_dev - Auxiliary device information + * @lif: Logical interface + * @idx: Index identifier + * @adev: Auxiliary device + */ +struct ionic_aux_dev { + struct ionic_lif *lif; + int idx; + struct auxiliary_device adev; +}; + +/** + * struct ionic_admin_ctx - Admin command context + * @work: Work completion wait queue element + * @cmd: Admin command (64B) to be copied to the queue + * @comp: Admin completion (16B) copied from the queue + */ +struct ionic_admin_ctx { + struct completion work; + union ionic_adminq_cmd cmd; + union ionic_adminq_comp comp; +}; + +#define IONIC_INTR_INDEX_NOT_ASSIGNED -1 +#define IONIC_INTR_NAME_MAX_SZ 32 + +/** + * struct ionic_intr_info - Interrupt information + * @name: Name identifier + * @rearm_count: Interrupt rearm count + * @index: Interrupt index position + * @vector: Interrupt number + * @dim_coal_hw: Interrupt coalesce value in hardware units + * @affinity_mask: CPU affinity mask + * @aff_notify: context for notification of IRQ affinity changes + */ +struct ionic_intr_info { + char name[IONIC_INTR_NAME_MAX_SZ]; + u64 rearm_count; + unsigned int index; + unsigned int vector; + u32 dim_coal_hw; + cpumask_var_t *affinity_mask; + struct irq_affinity_notify aff_notify; +}; + +/** + * ionic_adminq_post_wait - Post an admin command and wait for response + * @lif: Logical interface + * @ctx: API admin command context + * + * Post the command to an admin queue in the ethernet driver. If this command + * succeeds, then the command has been posted, but that does not indicate a + * completion. 
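Callers typically keep @ctx on the stack, initializing @ctx->work with + * COMPLETION_INITIALIZER_ONSTACK(); a minimal sketch, with an illustrative + * opcode: + * + *	struct ionic_admin_ctx ctx = { + *		.work = COMPLETION_INITIALIZER_ONSTACK(ctx.work), + *	}; + * + *	ctx.cmd.q_init.opcode = IONIC_CMD_Q_INIT; + *	err = ionic_adminq_post_wait(lif, &ctx); + *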
If this command returns success, then the completion callback + * will eventually be called. + * + * Return: zero or negative error status + */ +int ionic_adminq_post_wait(struct ionic_lif *lif, struct ionic_admin_ctx *ctx); + +/** + * ionic_error_to_errno - Transform ionic_if errors to os errno + * @code: Ionic error number + * + * Return: Negative OS error number or zero + */ +int ionic_error_to_errno(enum ionic_status_code code); + +/** + * ionic_request_rdma_reset - request reset or disable the device or lif + * @lif: Logical interface + * + * The reset is triggered asynchronously. It will wait until reset request + * completes or times out. + */ +void ionic_request_rdma_reset(struct ionic_lif *lif); + +/** + * ionic_intr_alloc - Reserve a device interrupt + * @lif: Logical interface + * @intr: Reserved ionic interrupt structure + * + * Reserve an interrupt index and get irq number for that index. + * + * Return: zero or negative error status + */ +int ionic_intr_alloc(struct ionic_lif *lif, struct ionic_intr_info *intr); + +/** + * ionic_intr_free - Release a device interrupt index + * @lif: Logical interface + * @intr: Interrupt index + * + * Mark the interrupt index unused so that it can be reserved again. + */ +void ionic_intr_free(struct ionic_lif *lif, int intr); + +/** + * ionic_get_cmb - Reserve cmb pages + * @lif: Logical interface + * @pgid: First page index + * @pgaddr: First page bus addr (contiguous) + * @order: Log base two number of pages (PAGE_SIZE) + * @stride_log2: Size of stride to determine CMB pool + * @expdb: Will be set to true if this CMB region has expdb enabled + * + * Return: zero or negative error status + */ +int ionic_get_cmb(struct ionic_lif *lif, u32 *pgid, phys_addr_t *pgaddr, + int order, u8 stride_log2, bool *expdb); + +/** + * ionic_put_cmb - Release cmb pages + * @lif: Logical interface + * @pgid: First page index + * @order: Log base two number of pages (PAGE_SIZE) + */ +void ionic_put_cmb(struct ionic_lif *lif, u32 pgid, int order); + +#endif /* _IONIC_API_H_ */ diff --git a/drivers/net/ethernet/pensando/ionic/ionic_aux.c b/drivers/net/ethernet/pensando/ionic/ionic_aux.c new file mode 100644 index 000000000000..a2be338eb3e5 --- /dev/null +++ b/drivers/net/ethernet/pensando/ionic/ionic_aux.c @@ -0,0 +1,102 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2018-2025, Advanced Micro Devices, Inc. 
*/ + +#include <linux/kernel.h> +#include "ionic.h" +#include "ionic_lif.h" +#include "ionic_aux.h" + +static DEFINE_IDA(aux_ida); + +static void ionic_auxbus_release(struct device *dev) +{ + struct ionic_aux_dev *ionic_adev; + + ionic_adev = container_of(dev, struct ionic_aux_dev, adev.dev); + ida_free(&aux_ida, ionic_adev->adev.id); + kfree(ionic_adev); +} + +int ionic_auxbus_register(struct ionic_lif *lif) +{ + struct ionic_aux_dev *ionic_adev; + struct auxiliary_device *aux_dev; + int err, id; + + if (!(le64_to_cpu(lif->ionic->ident.lif.capabilities) & IONIC_LIF_CAP_RDMA)) + return 0; + + ionic_adev = kzalloc(sizeof(*ionic_adev), GFP_KERNEL); + if (!ionic_adev) + return -ENOMEM; + + aux_dev = &ionic_adev->adev; + + id = ida_alloc(&aux_ida, GFP_KERNEL); + if (id < 0) { + dev_err(lif->ionic->dev, "Failed to allocate aux id: %d\n", id); + kfree(ionic_adev); + return id; + } + + aux_dev->id = id; + aux_dev->name = "rdma"; + aux_dev->dev.parent = &lif->ionic->pdev->dev; + aux_dev->dev.release = ionic_auxbus_release; + ionic_adev->lif = lif; + err = auxiliary_device_init(aux_dev); + if (err) { + dev_err(lif->ionic->dev, "Failed to initialize %s aux device: %d\n", + aux_dev->name, err); + ida_free(&aux_ida, id); + kfree(ionic_adev); + return err; + } + + err = auxiliary_device_add(aux_dev); + if (err) { + dev_err(lif->ionic->dev, "Failed to add %s aux device: %d\n", + aux_dev->name, err); + auxiliary_device_uninit(aux_dev); + return err; + } + + lif->ionic_adev = ionic_adev; + return 0; +} + +void ionic_auxbus_unregister(struct ionic_lif *lif) +{ + mutex_lock(&lif->adev_lock); + if (!lif->ionic_adev) + goto out; + + auxiliary_device_delete(&lif->ionic_adev->adev); + auxiliary_device_uninit(&lif->ionic_adev->adev); + + lif->ionic_adev = NULL; +out: + mutex_unlock(&lif->adev_lock); +} + +void ionic_request_rdma_reset(struct ionic_lif *lif) +{ + struct ionic *ionic = lif->ionic; + int err; + + union ionic_dev_cmd cmd = { + .cmd.opcode = IONIC_CMD_RDMA_RESET_LIF, + .cmd.lif_index = cpu_to_le16(lif->index), + }; + + mutex_lock(&ionic->dev_cmd_lock); + + ionic_dev_cmd_go(&ionic->idev, &cmd); + err = ionic_dev_cmd_wait(ionic, DEVCMD_TIMEOUT); + + mutex_unlock(&ionic->dev_cmd_lock); + + if (err) + pr_warn("%s request_reset: error %d\n", __func__, err); +} +EXPORT_SYMBOL_NS(ionic_request_rdma_reset, "NET_IONIC"); diff --git a/drivers/net/ethernet/pensando/ionic/ionic_aux.h b/drivers/net/ethernet/pensando/ionic/ionic_aux.h new file mode 100644 index 000000000000..f5528a9f187d --- /dev/null +++ b/drivers/net/ethernet/pensando/ionic/ionic_aux.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2018-2025, Advanced Micro Devices, Inc. 
*/ + +#ifndef _IONIC_AUX_H_ +#define _IONIC_AUX_H_ + +int ionic_auxbus_register(struct ionic_lif *lif); +void ionic_auxbus_unregister(struct ionic_lif *lif); + +#endif /* _IONIC_AUX_H_ */ diff --git a/drivers/net/ethernet/pensando/ionic/ionic_bus_pci.c b/drivers/net/ethernet/pensando/ionic/ionic_bus_pci.c index 136bfa3516d0..70d86c5f52fb 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_bus_pci.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_bus_pci.c @@ -9,6 +9,7 @@ #include "ionic.h" #include "ionic_bus.h" #include "ionic_lif.h" +#include "ionic_aux.h" #include "ionic_debugfs.h" /* Supported devices */ @@ -271,6 +272,8 @@ static int ionic_setup_one(struct ionic *ionic) } ionic_debugfs_add_ident(ionic); + ionic_map_cmb(ionic); + err = ionic_init(ionic); if (err) { dev_err(dev, "Cannot init device: %d, aborting\n", err); @@ -375,6 +378,8 @@ static int ionic_probe(struct pci_dev *pdev, const struct pci_device_id *ent) goto err_out_deregister_devlink; } + ionic_auxbus_register(ionic->lif); + mod_timer(&ionic->watchdog_timer, round_jiffies(jiffies + ionic->watchdog_period)); ionic_queue_doorbell_check(ionic, IONIC_NAPI_DEADLINE); @@ -416,6 +421,7 @@ static void ionic_remove(struct pci_dev *pdev) if (ionic->lif->doorbell_wa) cancel_delayed_work_sync(&ionic->doorbell_check_dwork); + ionic_auxbus_unregister(ionic->lif); ionic_lif_unregister(ionic->lif); ionic_devlink_unregister(ionic); ionic_lif_deinit(ionic->lif); @@ -445,6 +451,7 @@ static void ionic_reset_prepare(struct pci_dev *pdev) timer_delete_sync(&ionic->watchdog_timer); cancel_work_sync(&lif->deferred.work); + ionic_auxbus_unregister(ionic->lif); mutex_lock(&lif->queue_lock); ionic_stop_queues_reconfig(lif); ionic_txrx_free(lif); diff --git a/drivers/net/ethernet/pensando/ionic/ionic_dev.c b/drivers/net/ethernet/pensando/ionic/ionic_dev.c index 093c5358b6e8..ab27e9225c1e 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_dev.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_dev.c @@ -199,13 +199,201 @@ void ionic_init_devinfo(struct ionic *ionic) dev_dbg(ionic->dev, "fw_version %s\n", idev->dev_info.fw_version); } +static void ionic_map_disc_cmb(struct ionic *ionic) +{ + struct ionic_identity *ident = &ionic->ident; + u32 length_reg0, length, offset, num_regions; + struct ionic_dev_bar *bar = ionic->bars; + struct ionic_dev *idev = &ionic->idev; + struct device *dev = ionic->dev; + int err, sz, i; + u64 end; + + mutex_lock(&ionic->dev_cmd_lock); + + ionic_dev_cmd_discover_cmb(idev); + err = ionic_dev_cmd_wait(ionic, DEVCMD_TIMEOUT); + if (!err) { + sz = min(sizeof(ident->cmb_layout), + sizeof(idev->dev_cmd_regs->data)); + memcpy_fromio(&ident->cmb_layout, + &idev->dev_cmd_regs->data, sz); + } + mutex_unlock(&ionic->dev_cmd_lock); + + if (err) { + dev_warn(dev, "Cannot discover CMB layout, disabling CMB\n"); + return; + } + + bar += 2; + + num_regions = le32_to_cpu(ident->cmb_layout.num_regions); + if (!num_regions || num_regions > IONIC_MAX_CMB_REGIONS) { + dev_warn(dev, "Invalid number of CMB entries (%d)\n", + num_regions); + return; + } + + dev_dbg(dev, "ionic_cmb_layout_identity num_regions %d flags %x:\n", + num_regions, ident->cmb_layout.flags); + + for (i = 0; i < num_regions; i++) { + offset = le32_to_cpu(ident->cmb_layout.region[i].offset); + length = le32_to_cpu(ident->cmb_layout.region[i].length); + end = offset + length; + + dev_dbg(dev, "CMB entry %d: bar_num %u cmb_type %u offset %x length %u\n", + i, ident->cmb_layout.region[i].bar_num, + ident->cmb_layout.region[i].cmb_type, + offset, length); + + if (end > 
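/* region offset/length are in 64KiB units, bar->len is in bytes */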
(bar->len >> IONIC_CMB_SHIFT_64K)) { + dev_warn(dev, "Out of bounds CMB region %d offset %x length %u\n", + i, offset, length); + return; + } + } + + /* if first entry matches PCI config, expdb is not supported */ + if (ident->cmb_layout.region[0].bar_num == bar->res_index && + le32_to_cpu(ident->cmb_layout.region[0].length) == bar->len && + !ident->cmb_layout.region[0].offset) { + dev_warn(dev, "No CMB mapping discovered\n"); + return; + } + + /* process first entry for regular mapping */ + length_reg0 = le32_to_cpu(ident->cmb_layout.region[0].length); + if (!length_reg0) { + dev_warn(dev, "region len = 0. No CMB mapping discovered\n"); + return; + } + + /* Verify first entry size matches expected 8MB size (in 64KB pages) */ + if (length_reg0 != IONIC_BAR2_CMB_ENTRY_SIZE >> IONIC_CMB_SHIFT_64K) { + dev_warn(dev, "Unexpected CMB size in entry 0: %u pages\n", + length_reg0); + return; + } + + sz = BITS_TO_LONGS((length_reg0 << IONIC_CMB_SHIFT_64K) / + PAGE_SIZE) * sizeof(long); + idev->cmb_inuse = kzalloc(sz, GFP_KERNEL); + if (!idev->cmb_inuse) { + dev_warn(dev, "No memory for CMB, disabling\n"); + idev->phy_cmb_pages = 0; + idev->phy_cmb_expdb64_pages = 0; + idev->phy_cmb_expdb128_pages = 0; + idev->phy_cmb_expdb256_pages = 0; + idev->phy_cmb_expdb512_pages = 0; + idev->cmb_npages = 0; + return; + } + + for (i = 0; i < num_regions; i++) { + /* check this region matches first region length as to + * ease implementation + */ + if (le32_to_cpu(ident->cmb_layout.region[i].length) != + length_reg0) + continue; + + offset = le32_to_cpu(ident->cmb_layout.region[i].offset); + + switch (ident->cmb_layout.region[i].cmb_type) { + case IONIC_CMB_TYPE_DEVMEM: + idev->phy_cmb_pages = bar->bus_addr + offset; + idev->cmb_npages = + (length_reg0 << IONIC_CMB_SHIFT_64K) / PAGE_SIZE; + dev_dbg(dev, "regular cmb mapping: bar->bus_addr %pa region[%d].length %u\n", + &bar->bus_addr, i, length); + dev_dbg(dev, "idev->phy_cmb_pages %pad, idev->cmb_npages %u\n", + &idev->phy_cmb_pages, idev->cmb_npages); + break; + + case IONIC_CMB_TYPE_EXPDB64: + idev->phy_cmb_expdb64_pages = + bar->bus_addr + (offset << IONIC_CMB_SHIFT_64K); + dev_dbg(dev, "idev->phy_cmb_expdb64_pages %pad\n", + &idev->phy_cmb_expdb64_pages); + break; + + case IONIC_CMB_TYPE_EXPDB128: + idev->phy_cmb_expdb128_pages = + bar->bus_addr + (offset << IONIC_CMB_SHIFT_64K); + dev_dbg(dev, "idev->phy_cmb_expdb128_pages %pad\n", + &idev->phy_cmb_expdb128_pages); + break; + + case IONIC_CMB_TYPE_EXPDB256: + idev->phy_cmb_expdb256_pages = + bar->bus_addr + (offset << IONIC_CMB_SHIFT_64K); + dev_dbg(dev, "idev->phy_cmb_expdb256_pages %pad\n", + &idev->phy_cmb_expdb256_pages); + break; + + case IONIC_CMB_TYPE_EXPDB512: + idev->phy_cmb_expdb512_pages = + bar->bus_addr + (offset << IONIC_CMB_SHIFT_64K); + dev_dbg(dev, "idev->phy_cmb_expdb512_pages %pad\n", + &idev->phy_cmb_expdb512_pages); + break; + + default: + dev_warn(dev, "[%d] Invalid cmb_type (%d)\n", + i, ident->cmb_layout.region[i].cmb_type); + break; + } + } +} + +static void ionic_map_classic_cmb(struct ionic *ionic) +{ + struct ionic_dev_bar *bar = ionic->bars; + struct ionic_dev *idev = &ionic->idev; + struct device *dev = ionic->dev; + int sz; + + bar += 2; + /* classic CMB mapping */ + idev->phy_cmb_pages = bar->bus_addr; + idev->cmb_npages = bar->len / PAGE_SIZE; + dev_dbg(dev, "classic cmb mapping: bar->bus_addr %pa bar->len %lu\n", + &bar->bus_addr, bar->len); + dev_dbg(dev, "idev->phy_cmb_pages %pad, idev->cmb_npages %u\n", + &idev->phy_cmb_pages, idev->cmb_npages); + + sz = 
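/* one bit per CMB page */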
BITS_TO_LONGS(idev->cmb_npages) * sizeof(long); + idev->cmb_inuse = kzalloc(sz, GFP_KERNEL); + if (!idev->cmb_inuse) { + idev->phy_cmb_pages = 0; + idev->cmb_npages = 0; + } +} + +void ionic_map_cmb(struct ionic *ionic) +{ + struct pci_dev *pdev = ionic->pdev; + struct device *dev = ionic->dev; + + if (!(pci_resource_flags(pdev, 4) & IORESOURCE_MEM)) { + dev_dbg(dev, "No CMB, disabling\n"); + return; + } + + if (ionic->ident.dev.capabilities & cpu_to_le64(IONIC_DEV_CAP_DISC_CMB)) + ionic_map_disc_cmb(ionic); + else + ionic_map_classic_cmb(ionic); +} + int ionic_dev_setup(struct ionic *ionic) { struct ionic_dev_bar *bar = ionic->bars; unsigned int num_bars = ionic->num_bars; struct ionic_dev *idev = &ionic->idev; struct device *dev = ionic->dev; - int size; u32 sig; int err; @@ -255,16 +443,11 @@ int ionic_dev_setup(struct ionic *ionic) mutex_init(&idev->cmb_inuse_lock); if (num_bars < 3 || !ionic->bars[IONIC_PCI_BAR_CMB].len) { idev->cmb_inuse = NULL; + idev->phy_cmb_pages = 0; + idev->cmb_npages = 0; return 0; } - idev->phy_cmb_pages = bar->bus_addr; - idev->cmb_npages = bar->len / PAGE_SIZE; - size = BITS_TO_LONGS(idev->cmb_npages) * sizeof(long); - idev->cmb_inuse = kzalloc(size, GFP_KERNEL); - if (!idev->cmb_inuse) - dev_warn(dev, "No memory for CMB, disabling\n"); - return 0; } @@ -277,6 +460,11 @@ void ionic_dev_teardown(struct ionic *ionic) idev->phy_cmb_pages = 0; idev->cmb_npages = 0; + idev->phy_cmb_expdb64_pages = 0; + idev->phy_cmb_expdb128_pages = 0; + idev->phy_cmb_expdb256_pages = 0; + idev->phy_cmb_expdb512_pages = 0; + if (ionic->wq) { destroy_workqueue(ionic->wq); ionic->wq = NULL; @@ -698,28 +886,79 @@ void ionic_dev_cmd_adminq_init(struct ionic_dev *idev, struct ionic_qcq *qcq, ionic_dev_cmd_go(idev, &cmd); } +void ionic_dev_cmd_discover_cmb(struct ionic_dev *idev) +{ + union ionic_dev_cmd cmd = { + .discover_cmb.opcode = IONIC_CMD_DISCOVER_CMB, + }; + + ionic_dev_cmd_go(idev, &cmd); +} + int ionic_db_page_num(struct ionic_lif *lif, int pid) { return (lif->hw_index * lif->dbid_count) + pid; } -int ionic_get_cmb(struct ionic_lif *lif, u32 *pgid, phys_addr_t *pgaddr, int order) +int ionic_get_cmb(struct ionic_lif *lif, u32 *pgid, phys_addr_t *pgaddr, + int order, u8 stride_log2, bool *expdb) { struct ionic_dev *idev = &lif->ionic->idev; - int ret; + void __iomem *nonexpdb_pgptr; + phys_addr_t nonexpdb_pgaddr; + int i, idx; mutex_lock(&idev->cmb_inuse_lock); - ret = bitmap_find_free_region(idev->cmb_inuse, idev->cmb_npages, order); + idx = bitmap_find_free_region(idev->cmb_inuse, idev->cmb_npages, order); mutex_unlock(&idev->cmb_inuse_lock); - if (ret < 0) - return ret; + if (idx < 0) + return idx; + + *pgid = (u32)idx; + + if (idev->phy_cmb_expdb64_pages && + stride_log2 == IONIC_EXPDB_64B_WQE_LG2) { + *pgaddr = idev->phy_cmb_expdb64_pages + idx * PAGE_SIZE; + if (expdb) + *expdb = true; + } else if (idev->phy_cmb_expdb128_pages && + stride_log2 == IONIC_EXPDB_128B_WQE_LG2) { + *pgaddr = idev->phy_cmb_expdb128_pages + idx * PAGE_SIZE; + if (expdb) + *expdb = true; + } else if (idev->phy_cmb_expdb256_pages && + stride_log2 == IONIC_EXPDB_256B_WQE_LG2) { + *pgaddr = idev->phy_cmb_expdb256_pages + idx * PAGE_SIZE; + if (expdb) + *expdb = true; + } else if (idev->phy_cmb_expdb512_pages && + stride_log2 == IONIC_EXPDB_512B_WQE_LG2) { + *pgaddr = idev->phy_cmb_expdb512_pages + idx * PAGE_SIZE; + if (expdb) + *expdb = true; + } else { + *pgaddr = idev->phy_cmb_pages + idx * PAGE_SIZE; + if (expdb) + *expdb = false; + } - *pgid = ret; - *pgaddr = idev->phy_cmb_pages + ret * 
PAGE_SIZE; + /* clear the requested CMB region, 1 PAGE_SIZE ioremap at a time */ + nonexpdb_pgaddr = idev->phy_cmb_pages + idx * PAGE_SIZE; + for (i = 0; i < (1 << order); i++) { + nonexpdb_pgptr = + ioremap_wc(nonexpdb_pgaddr + i * PAGE_SIZE, PAGE_SIZE); + if (!nonexpdb_pgptr) { + ionic_put_cmb(lif, *pgid, order); + return -ENOMEM; + } + memset_io(nonexpdb_pgptr, 0, PAGE_SIZE); + iounmap(nonexpdb_pgptr); + } return 0; } +EXPORT_SYMBOL_NS(ionic_get_cmb, "NET_IONIC"); void ionic_put_cmb(struct ionic_lif *lif, u32 pgid, int order) { @@ -729,6 +968,7 @@ void ionic_put_cmb(struct ionic_lif *lif, u32 pgid, int order) bitmap_release_region(idev->cmb_inuse, pgid, order); mutex_unlock(&idev->cmb_inuse_lock); } +EXPORT_SYMBOL_NS(ionic_put_cmb, "NET_IONIC"); int ionic_cq_init(struct ionic_lif *lif, struct ionic_cq *cq, struct ionic_intr_info *intr, diff --git a/drivers/net/ethernet/pensando/ionic/ionic_dev.h b/drivers/net/ethernet/pensando/ionic/ionic_dev.h index c8c710cfe70c..35566f97eaea 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_dev.h +++ b/drivers/net/ethernet/pensando/ionic/ionic_dev.h @@ -12,6 +12,7 @@ #include "ionic_if.h" #include "ionic_regs.h" +#include "ionic_api.h" #define IONIC_MAX_TX_DESC 8192 #define IONIC_MAX_RX_DESC 16384 @@ -34,6 +35,11 @@ #define IONIC_RX_MIN_DOORBELL_DEADLINE (HZ / 100) /* 10ms */ #define IONIC_RX_MAX_DOORBELL_DEADLINE (HZ * 4) /* 4s */ +#define IONIC_EXPDB_64B_WQE_LG2 6 +#define IONIC_EXPDB_128B_WQE_LG2 7 +#define IONIC_EXPDB_256B_WQE_LG2 8 +#define IONIC_EXPDB_512B_WQE_LG2 9 + struct ionic_dev_bar { void __iomem *vaddr; phys_addr_t bus_addr; @@ -170,6 +176,11 @@ struct ionic_dev { dma_addr_t phy_cmb_pages; u32 cmb_npages; + dma_addr_t phy_cmb_expdb64_pages; + dma_addr_t phy_cmb_expdb128_pages; + dma_addr_t phy_cmb_expdb256_pages; + dma_addr_t phy_cmb_expdb512_pages; + u32 port_info_sz; struct ionic_port_info *port_info; dma_addr_t port_info_pa; @@ -273,19 +284,6 @@ struct ionic_queue { char name[IONIC_QUEUE_NAME_MAX_SZ]; } ____cacheline_aligned_in_smp; -#define IONIC_INTR_INDEX_NOT_ASSIGNED -1 -#define IONIC_INTR_NAME_MAX_SZ 32 - -struct ionic_intr_info { - char name[IONIC_INTR_NAME_MAX_SZ]; - u64 rearm_count; - unsigned int index; - unsigned int vector; - u32 dim_coal_hw; - cpumask_var_t *affinity_mask; - struct irq_affinity_notify aff_notify; -}; - struct ionic_cq { struct ionic_lif *lif; struct ionic_queue *bound_q; @@ -363,8 +361,8 @@ void ionic_dev_cmd_adminq_init(struct ionic_dev *idev, struct ionic_qcq *qcq, int ionic_db_page_num(struct ionic_lif *lif, int pid); -int ionic_get_cmb(struct ionic_lif *lif, u32 *pgid, phys_addr_t *pgaddr, int order); -void ionic_put_cmb(struct ionic_lif *lif, u32 pgid, int order); +void ionic_dev_cmd_discover_cmb(struct ionic_dev *idev); +void ionic_map_cmb(struct ionic *ionic); int ionic_cq_init(struct ionic_lif *lif, struct ionic_cq *cq, struct ionic_intr_info *intr, diff --git a/drivers/net/ethernet/pensando/ionic/ionic_if.h b/drivers/net/ethernet/pensando/ionic/ionic_if.h index 9886cd66ce68..47559c909c8b 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_if.h +++ b/drivers/net/ethernet/pensando/ionic/ionic_if.h @@ -56,6 +56,9 @@ enum ionic_cmd_opcode { IONIC_CMD_VF_SETATTR = 61, IONIC_CMD_VF_CTRL = 62, + /* CMB command */ + IONIC_CMD_DISCOVER_CMB = 80, + /* QoS commands */ IONIC_CMD_QOS_CLASS_IDENTIFY = 240, IONIC_CMD_QOS_CLASS_INIT = 241, @@ -269,9 +272,11 @@ union ionic_drv_identity { /** * enum ionic_dev_capability - Device capabilities * @IONIC_DEV_CAP_VF_CTRL: Device supports VF ctrl operations + * 
@IONIC_DEV_CAP_DISC_CMB: Device supports CMB discovery operations */ enum ionic_dev_capability { IONIC_DEV_CAP_VF_CTRL = BIT(0), + IONIC_DEV_CAP_DISC_CMB = BIT(1), }; /** @@ -395,6 +400,7 @@ enum ionic_logical_qtype { * @IONIC_Q_F_4X_DESC: Quadruple main descriptor size * @IONIC_Q_F_4X_CQ_DESC: Quadruple cq descriptor size * @IONIC_Q_F_4X_SG_DESC: Quadruple sg descriptor size + * @IONIC_QIDENT_F_EXPDB: Queue supports express doorbell */ enum ionic_q_feature { IONIC_QIDENT_F_CQ = BIT_ULL(0), @@ -407,6 +413,7 @@ enum ionic_q_feature { IONIC_Q_F_4X_DESC = BIT_ULL(7), IONIC_Q_F_4X_CQ_DESC = BIT_ULL(8), IONIC_Q_F_4X_SG_DESC = BIT_ULL(9), + IONIC_QIDENT_F_EXPDB = BIT_ULL(10), }; /** @@ -495,6 +502,16 @@ union ionic_lif_config { }; /** + * enum ionic_lif_rdma_cap_stats - LIF stat type + * @IONIC_LIF_RDMA_STAT_GLOBAL: Global stats + * @IONIC_LIF_RDMA_STAT_QP: Queue pair stats + */ +enum ionic_lif_rdma_cap_stats { + IONIC_LIF_RDMA_STAT_GLOBAL = BIT(0), + IONIC_LIF_RDMA_STAT_QP = BIT(1), +}; + +/** * struct ionic_lif_identity - LIF identity information (type-specific) * * @capabilities: LIF capabilities @@ -513,10 +530,10 @@ union ionic_lif_config { * @eth.config: LIF config struct with features, mtu, mac, q counts * * @rdma: RDMA identify structure - * @rdma.version: RDMA version of opcodes and queue descriptors + * @rdma.version: RDMA capability version * @rdma.qp_opcodes: Number of RDMA queue pair opcodes supported * @rdma.admin_opcodes: Number of RDMA admin opcodes supported - * @rdma.rsvd: reserved byte(s) + * @rdma.minor_version: RDMA capability minor version * @rdma.npts_per_lif: Page table size per LIF * @rdma.nmrs_per_lif: Number of memory regions per LIF * @rdma.nahs_per_lif: Number of address handles per LIF @@ -526,12 +543,17 @@ union ionic_lif_config { * @rdma.rrq_stride: Remote RQ work request stride * @rdma.rsq_stride: Remote SQ work request stride * @rdma.dcqcn_profiles: Number of DCQCN profiles - * @rdma.rsvd_dimensions: reserved byte(s) + * @rdma.udma_shift: Log2 number of queues per queue group + * @rdma.rsvd_dimensions: Reserved byte + * @rdma.page_size_cap: Supported page sizes * @rdma.aq_qtype: RDMA Admin Qtype * @rdma.sq_qtype: RDMA Send Qtype * @rdma.rq_qtype: RDMA Receive Qtype * @rdma.cq_qtype: RDMA Completion Qtype * @rdma.eq_qtype: RDMA Event Qtype + * @rdma.stats_type: Supported statistics type + * (enum ionic_lif_rdma_cap_stats) + * @rdma.rsvd1: Reserved byte(s) * @words: word access to struct contents */ union ionic_lif_identity { @@ -557,7 +579,7 @@ union ionic_lif_identity { u8 version; u8 qp_opcodes; u8 admin_opcodes; - u8 rsvd; + u8 minor_version; __le32 npts_per_lif; __le32 nmrs_per_lif; __le32 nahs_per_lif; @@ -567,12 +589,16 @@ union ionic_lif_identity { u8 rrq_stride; u8 rsq_stride; u8 dcqcn_profiles; - u8 rsvd_dimensions[10]; + u8 udma_shift; + u8 rsvd_dimensions; + __le64 page_size_cap; struct ionic_lif_logical_qtype aq_qtype; struct ionic_lif_logical_qtype sq_qtype; struct ionic_lif_logical_qtype rq_qtype; struct ionic_lif_logical_qtype cq_qtype; struct ionic_lif_logical_qtype eq_qtype; + __le16 stats_type; + u8 rsvd1[162]; } __packed rdma; } __packed; __le32 words[478]; @@ -2195,6 +2221,80 @@ struct ionic_vf_ctrl_comp { }; /** + * struct ionic_discover_cmb_cmd - CMB discovery command + * @opcode: Opcode for the command + * @rsvd: Reserved bytes + */ +struct ionic_discover_cmb_cmd { + u8 opcode; + u8 rsvd[63]; +}; + +/** + * struct ionic_discover_cmb_comp - CMB discover command completion. 
+ * @status: Status of the command (enum ionic_status_code) + * @rsvd: Reserved bytes + */ +struct ionic_discover_cmb_comp { + u8 status; + u8 rsvd[15]; +}; + +#define IONIC_MAX_CMB_REGIONS 16 +#define IONIC_CMB_SHIFT_64K 16 + +enum ionic_cmb_type { + IONIC_CMB_TYPE_DEVMEM = 0, + IONIC_CMB_TYPE_EXPDB64 = 1, + IONIC_CMB_TYPE_EXPDB128 = 2, + IONIC_CMB_TYPE_EXPDB256 = 3, + IONIC_CMB_TYPE_EXPDB512 = 4, +}; + +/** + * union ionic_cmb_region - Configuration for CMB region + * @bar_num: CMB mapping number from FW + * @cmb_type: Type of CMB this region describes (enum ionic_cmb_type) + * @rsvd: Reserved + * @offset: Offset within BAR in 64KB pages + * @length: Length of the CMB region in 64KB pages + * @words: 32-bit words for direct access to the entire region + */ +union ionic_cmb_region { + struct { + u8 bar_num; + u8 cmb_type; + u8 rsvd[6]; + __le32 offset; + __le32 length; + } __packed; + __le32 words[4]; +}; + +/** + * union ionic_discover_cmb_identity - CMB layout identity structure + * @num_regions: Number of CMB regions, up to 16 + * @flags: Feature and capability bits (bit 0 for express + * doorbell, bit 1 for 4K alignment indicator, + * bits 31-24 for version information) + * @region: CMB mapping regions, entry 0 for regular + * mapping, entries 1-7 for WQE sizes 64, + * 128, 256, 512, 1024, 2048 and 4096 bytes + * @words: Full union buffer size + */ +union ionic_discover_cmb_identity { + struct { + __le32 num_regions; +#define IONIC_CMB_FLAG_EXPDB BIT(0) +#define IONIC_CMB_FLAG_4KALIGN BIT(1) +#define IONIC_CMB_FLAG_VERSION 0xff000000 + __le32 flags; + union ionic_cmb_region region[IONIC_MAX_CMB_REGIONS]; + }; + __le32 words[478]; +}; + +/** * struct ionic_qos_identify_cmd - QoS identify command * @opcode: opcode * @ver: Highest version of identify supported by driver @@ -3054,6 +3154,8 @@ union ionic_dev_cmd { struct ionic_vf_getattr_cmd vf_getattr; struct ionic_vf_ctrl_cmd vf_ctrl; + struct ionic_discover_cmb_cmd discover_cmb; + struct ionic_lif_identify_cmd lif_identify; struct ionic_lif_init_cmd lif_init; struct ionic_lif_reset_cmd lif_reset; @@ -3093,6 +3195,8 @@ union ionic_dev_cmd_comp { struct ionic_vf_getattr_comp vf_getattr; struct ionic_vf_ctrl_comp vf_ctrl; + struct ionic_discover_cmb_comp discover_cmb; + struct ionic_lif_identify_comp lif_identify; struct ionic_lif_init_comp lif_init; ionic_lif_reset_comp lif_reset; @@ -3234,6 +3338,9 @@ union ionic_adminq_comp { #define IONIC_BAR0_DEV_CMD_DATA_REGS_OFFSET 0x0c00 #define IONIC_BAR0_INTR_STATUS_OFFSET 0x1000 #define IONIC_BAR0_INTR_CTRL_OFFSET 0x2000 + +/* BAR2 */ +#define IONIC_BAR2_CMB_ENTRY_SIZE 0x800000 #define IONIC_DEV_CMD_DONE 0x00000001 #define IONIC_ASIC_TYPE_NONE 0 @@ -3287,6 +3394,7 @@ struct ionic_identity { union ionic_port_identity port; union ionic_qos_identity qos; union ionic_q_identity txq; + union ionic_discover_cmb_identity cmb_layout; }; #endif /* _IONIC_IF_H_ */ diff --git a/drivers/net/ethernet/pensando/ionic/ionic_lif.c b/drivers/net/ethernet/pensando/ionic/ionic_lif.c index 48cb5d30b5f6..b28966ae50c2 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_lif.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_lif.c @@ -19,6 +19,7 @@ #include "ionic_bus.h" #include "ionic_dev.h" #include "ionic_lif.h" +#include "ionic_aux.h" #include "ionic_txrx.h" #include "ionic_ethtool.h" #include "ionic_debugfs.h" @@ -243,29 +244,36 @@ static int ionic_request_irq(struct ionic_lif *lif, struct ionic_qcq *qcq) 0, intr->name, &qcq->napi); } -static int ionic_intr_alloc(struct ionic_lif *lif, struct ionic_intr_info *intr) +int 
ionic_intr_alloc(struct ionic_lif *lif, struct ionic_intr_info *intr) { struct ionic *ionic = lif->ionic; - int index; + int index, err; index = find_first_zero_bit(ionic->intrs, ionic->nintrs); - if (index == ionic->nintrs) { - netdev_warn(lif->netdev, "%s: no intr, index=%d nintrs=%d\n", - __func__, index, ionic->nintrs); + if (index == ionic->nintrs) return -ENOSPC; - } set_bit(index, ionic->intrs); ionic_intr_init(&ionic->idev, intr, index); + err = ionic_bus_get_irq(ionic, intr->index); + if (err < 0) { + clear_bit(index, ionic->intrs); + return err; + } + + intr->vector = err; + return 0; } +EXPORT_SYMBOL_NS(ionic_intr_alloc, "NET_IONIC"); -static void ionic_intr_free(struct ionic *ionic, int index) +void ionic_intr_free(struct ionic_lif *lif, int index) { - if (index != IONIC_INTR_INDEX_NOT_ASSIGNED && index < ionic->nintrs) - clear_bit(index, ionic->intrs); + if (index != IONIC_INTR_INDEX_NOT_ASSIGNED && index < lif->ionic->nintrs) + clear_bit(index, lif->ionic->intrs); } +EXPORT_SYMBOL_NS(ionic_intr_free, "NET_IONIC"); static void ionic_irq_aff_notify(struct irq_affinity_notify *notify, const cpumask_t *mask) @@ -400,7 +408,7 @@ static void ionic_qcq_intr_free(struct ionic_lif *lif, struct ionic_qcq *qcq) irq_set_affinity_hint(qcq->intr.vector, NULL); devm_free_irq(lif->ionic->dev, qcq->intr.vector, &qcq->napi); qcq->intr.vector = 0; - ionic_intr_free(lif->ionic, qcq->intr.index); + ionic_intr_free(lif, qcq->intr.index); qcq->intr.index = IONIC_INTR_INDEX_NOT_ASSIGNED; } @@ -510,13 +518,6 @@ static int ionic_alloc_qcq_interrupt(struct ionic_lif *lif, struct ionic_qcq *qc goto err_out; } - err = ionic_bus_get_irq(lif->ionic, qcq->intr.index); - if (err < 0) { - netdev_warn(lif->netdev, "no vector for %s: %d\n", - qcq->q.name, err); - goto err_out_free_intr; - } - qcq->intr.vector = err; ionic_intr_mask_assert(lif->ionic->idev.intr_ctrl, qcq->intr.index, IONIC_INTR_MASK_SET); @@ -545,7 +546,7 @@ static int ionic_alloc_qcq_interrupt(struct ionic_lif *lif, struct ionic_qcq *qc return 0; err_out_free_intr: - ionic_intr_free(lif->ionic, qcq->intr.index); + ionic_intr_free(lif, qcq->intr.index); err_out: return err; } @@ -672,7 +673,7 @@ static int ionic_qcq_alloc(struct ionic_lif *lif, unsigned int type, new->cmb_order = order_base_2(new->cmb_q_size / PAGE_SIZE); err = ionic_get_cmb(lif, &new->cmb_pgid, &new->cmb_q_base_pa, - new->cmb_order); + new->cmb_order, 0, NULL); if (err) { netdev_err(lif->netdev, "Cannot allocate queue order %d from cmb: err %d\n", @@ -740,7 +741,7 @@ err_out_free_q: err_out_free_irq: if (flags & IONIC_QCQ_F_INTR) { devm_free_irq(dev, new->intr.vector, &new->napi); - ionic_intr_free(lif->ionic, new->intr.index); + ionic_intr_free(lif, new->intr.index); } err_out_free_page_pool: page_pool_destroy(new->q.page_pool); @@ -3293,6 +3294,7 @@ int ionic_lif_alloc(struct ionic *ionic) mutex_init(&lif->queue_lock); mutex_init(&lif->config_lock); + mutex_init(&lif->adev_lock); spin_lock_init(&lif->adminq_lock); @@ -3349,6 +3351,7 @@ err_out_free_lif_info: lif->info = NULL; lif->info_pa = 0; err_out_free_mutex: + mutex_destroy(&lif->adev_lock); mutex_destroy(&lif->config_lock); mutex_destroy(&lif->queue_lock); err_out_free_netdev: @@ -3384,6 +3387,7 @@ static void ionic_lif_handle_fw_down(struct ionic_lif *lif) netif_device_detach(lif->netdev); + ionic_auxbus_unregister(ionic->lif); mutex_lock(&lif->queue_lock); if (test_bit(IONIC_LIF_F_UP, lif->state)) { dev_info(ionic->dev, "Surprise FW stop, stopping queues\n"); @@ -3446,6 +3450,8 @@ int ionic_restart_lif(struct 
ionic_lif *lif) netif_device_attach(lif->netdev); ionic_queue_doorbell_check(ionic, IONIC_NAPI_DEADLINE); + ionic_auxbus_register(ionic->lif); + return 0; err_txrx_free: @@ -3528,6 +3534,7 @@ void ionic_lif_free(struct ionic_lif *lif) mutex_destroy(&lif->config_lock); mutex_destroy(&lif->queue_lock); + mutex_destroy(&lif->adev_lock); /* free netdev & lif */ ionic_debugfs_del_lif(lif); diff --git a/drivers/net/ethernet/pensando/ionic/ionic_lif.h b/drivers/net/ethernet/pensando/ionic/ionic_lif.h index e01756fb7fdd..43bdd0fb8733 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_lif.h +++ b/drivers/net/ethernet/pensando/ionic/ionic_lif.h @@ -10,6 +10,7 @@ #include <linux/dim.h> #include <linux/pci.h> #include "ionic_rx_filter.h" +#include "ionic_api.h" #define IONIC_ADMINQ_LENGTH 16 /* must be a power of two */ #define IONIC_NOTIFYQ_LENGTH 64 /* must be a power of two */ @@ -225,6 +226,8 @@ struct ionic_lif { dma_addr_t info_pa; u32 info_sz; struct ionic_qtype_info qtype_info[IONIC_QTYPE_MAX]; + struct ionic_aux_dev *ionic_adev; + struct mutex adev_lock; /* lock for aux_dev actions */ u8 rss_hash_key[IONIC_RSS_HASH_KEY_SIZE]; u8 *rss_ind_tbl; diff --git a/drivers/net/ethernet/pensando/ionic/ionic_main.c b/drivers/net/ethernet/pensando/ionic/ionic_main.c index 0e60a6bef99a..14dc055be3e9 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_main.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_main.c @@ -72,7 +72,7 @@ static const char *ionic_error_to_str(enum ionic_status_code code) } } -static int ionic_error_to_errno(enum ionic_status_code code) +int ionic_error_to_errno(enum ionic_status_code code) { switch (code) { case IONIC_RC_SUCCESS: @@ -114,6 +114,7 @@ static int ionic_error_to_errno(enum ionic_status_code code) return -EIO; } } +EXPORT_SYMBOL_NS(ionic_error_to_errno, "NET_IONIC"); static const char *ionic_opcode_to_str(enum ionic_cmd_opcode opcode) { @@ -480,6 +481,7 @@ int ionic_adminq_post_wait(struct ionic_lif *lif, struct ionic_admin_ctx *ctx) { return __ionic_adminq_post_wait(lif, ctx, true); } +EXPORT_SYMBOL_NS(ionic_adminq_post_wait, "NET_IONIC"); int ionic_adminq_post_wait_nomsg(struct ionic_lif *lif, struct ionic_admin_ctx *ctx) { diff --git a/drivers/scsi/aic94xx/aic94xx_task.c b/drivers/scsi/aic94xx/aic94xx_task.c index 4bfd03724ad6..b26a468ddc98 100644 --- a/drivers/scsi/aic94xx/aic94xx_task.c +++ b/drivers/scsi/aic94xx/aic94xx_task.c @@ -488,7 +488,6 @@ static int asd_build_ssp_ascb(struct asd_ascb *ascb, struct sas_task *task, scb->ssp_task.conn_handle = cpu_to_le16( (u16)(unsigned long)dev->lldd_dev); scb->ssp_task.data_dir = data_dir_flags[task->data_dir]; - scb->ssp_task.retry_count = scb->ssp_task.retry_count; ascb->tasklet_complete = asd_task_tasklet_complete; diff --git a/drivers/scsi/bfa/bfa_core.c b/drivers/scsi/bfa/bfa_core.c index a99a101b95ef..2559df8baa05 100644 --- a/drivers/scsi/bfa/bfa_core.c +++ b/drivers/scsi/bfa/bfa_core.c @@ -1282,7 +1282,6 @@ bfa_iocfc_cfgrsp(struct bfa_s *bfa) struct bfi_iocfc_cfgrsp_s *cfgrsp = iocfc->cfgrsp; struct bfa_iocfc_fwcfg_s *fwcfg = &cfgrsp->fwcfg; - fwcfg->num_cqs = fwcfg->num_cqs; fwcfg->num_ioim_reqs = be16_to_cpu(fwcfg->num_ioim_reqs); fwcfg->num_fwtio_reqs = be16_to_cpu(fwcfg->num_fwtio_reqs); fwcfg->num_tskim_reqs = be16_to_cpu(fwcfg->num_tskim_reqs); diff --git a/drivers/scsi/csiostor/csio_wr.c b/drivers/scsi/csiostor/csio_wr.c index a516df019c22..010a1df37f15 100644 --- a/drivers/scsi/csiostor/csio_wr.c +++ b/drivers/scsi/csiostor/csio_wr.c @@ -960,7 +960,7 @@ csio_wr_copy_to_wrp(void *data_buf, struct 
csio_wr_pair *wrp, memcpy((uint8_t *) wrp->addr1 + wr_off, data_buf, nbytes); data_len -= nbytes; - /* Write the remaining data from the begining of circular buffer */ + /* Write the remaining data from the beginning of circular buffer */ if (data_len) { CSIO_DB_ASSERT(data_len <= wrp->size2); CSIO_DB_ASSERT(wrp->addr2 != NULL); @@ -1224,7 +1224,7 @@ csio_wr_process_iq(struct csio_hw *hw, struct csio_q *q, /* * We need to re-arm SGE interrupts in case we got a stray interrupt, - * especially in msix mode. With INTx, this may be a common occurence. + * especially in msix mode. With INTx, this may be a common occurrence. */ if (unlikely(!q->inc_idx)) { CSIO_INC_STATS(q, n_stray_comp); diff --git a/drivers/scsi/hisi_sas/hisi_sas_main.c b/drivers/scsi/hisi_sas/hisi_sas_main.c index d1a4cc69d408..30a9c6612651 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_main.c +++ b/drivers/scsi/hisi_sas/hisi_sas_main.c @@ -876,7 +876,7 @@ static int hisi_sas_dev_found(struct domain_device *device) device->lldd_dev = sas_dev; hisi_hba->hw->setup_itct(hisi_hba, sas_dev); - if (parent_dev && dev_is_expander(parent_dev->dev_type)) { + if (dev_parent_is_expander(device)) { int phy_no; phy_no = sas_find_attached_phy_id(&parent_dev->ex_dev, device); diff --git a/drivers/scsi/hisi_sas/hisi_sas_v2_hw.c b/drivers/scsi/hisi_sas/hisi_sas_v2_hw.c index 4431698a5d78..f3516a0611dd 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_v2_hw.c +++ b/drivers/scsi/hisi_sas/hisi_sas_v2_hw.c @@ -925,7 +925,6 @@ static void setup_itct_v2_hw(struct hisi_hba *hisi_hba, struct device *dev = hisi_hba->dev; u64 qw0, device_id = sas_dev->device_id; struct hisi_sas_itct *itct = &hisi_hba->itct[device_id]; - struct domain_device *parent_dev = device->parent; struct asd_sas_port *sas_port = device->port; struct hisi_sas_port *port = to_hisi_sas_port(sas_port); u64 sas_addr; @@ -942,7 +941,7 @@ static void setup_itct_v2_hw(struct hisi_hba *hisi_hba, break; case SAS_SATA_DEV: case SAS_SATA_PENDING: - if (parent_dev && dev_is_expander(parent_dev->dev_type)) + if (dev_parent_is_expander(device)) qw0 = HISI_SAS_DEV_TYPE_STP << ITCT_HDR_DEV_TYPE_OFF; else qw0 = HISI_SAS_DEV_TYPE_SATA << ITCT_HDR_DEV_TYPE_OFF; @@ -2494,7 +2493,6 @@ static void prep_ata_v2_hw(struct hisi_hba *hisi_hba, { struct sas_task *task = slot->task; struct domain_device *device = task->dev; - struct domain_device *parent_dev = device->parent; struct hisi_sas_device *sas_dev = device->lldd_dev; struct hisi_sas_cmd_hdr *hdr = slot->cmd_hdr; struct asd_sas_port *sas_port = device->port; @@ -2509,7 +2507,7 @@ static void prep_ata_v2_hw(struct hisi_hba *hisi_hba, /* create header */ /* dw0 */ dw0 = port->id << CMD_HDR_PORT_OFF; - if (parent_dev && dev_is_expander(parent_dev->dev_type)) { + if (dev_parent_is_expander(device)) { dw0 |= 3 << CMD_HDR_CMD_OFF; } else { phy_id = device->phy->identify.phy_identifier;
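The hisi_sas hunks above (and the isci and libsas hunks below) replace the open-coded parent-expander test with a dev_parent_is_expander() helper whose definition is not visible in this section. A minimal sketch consistent with the call sites, presumably living next to dev_is_expander() in include/scsi/libsas.h:

static inline bool dev_parent_is_expander(struct domain_device *dev)
{
	return dev->parent && dev_is_expander(dev->parent->dev_type);
}

Folding the NULL check into the helper is what lets several callers drop their local parent_dev variables, as the removed declarations in the setup_itct and prep_ata hunks show.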
diff --git a/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c b/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c index 2f3d61abab3a..2f9e01717ef3 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c +++ b/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c @@ -874,7 +874,6 @@ static void setup_itct_v3_hw(struct hisi_hba *hisi_hba, struct device *dev = hisi_hba->dev; u64 qw0, device_id = sas_dev->device_id; struct hisi_sas_itct *itct = &hisi_hba->itct[device_id]; - struct domain_device *parent_dev = device->parent; struct asd_sas_port *sas_port = device->port; struct hisi_sas_port *port = to_hisi_sas_port(sas_port); u64 sas_addr; @@ -891,7 +890,7 @@ static void setup_itct_v3_hw(struct hisi_hba *hisi_hba, break; case SAS_SATA_DEV: case SAS_SATA_PENDING: - if (parent_dev && dev_is_expander(parent_dev->dev_type)) + if (dev_parent_is_expander(device)) qw0 = HISI_SAS_DEV_TYPE_STP << ITCT_HDR_DEV_TYPE_OFF; else qw0 = HISI_SAS_DEV_TYPE_SATA << ITCT_HDR_DEV_TYPE_OFF; @@ -1476,7 +1475,6 @@ static void prep_ata_v3_hw(struct hisi_hba *hisi_hba, { struct sas_task *task = slot->task; struct domain_device *device = task->dev; - struct domain_device *parent_dev = device->parent; struct hisi_sas_device *sas_dev = device->lldd_dev; struct hisi_sas_cmd_hdr *hdr = slot->cmd_hdr; struct asd_sas_port *sas_port = device->port; @@ -1487,7 +1485,7 @@ static void prep_ata_v3_hw(struct hisi_hba *hisi_hba, u32 dw1 = 0, dw2 = 0; hdr->dw0 = cpu_to_le32(port->id << CMD_HDR_PORT_OFF); - if (parent_dev && dev_is_expander(parent_dev->dev_type)) { + if (dev_parent_is_expander(device)) { hdr->dw0 |= cpu_to_le32(3 << CMD_HDR_CMD_OFF); } else { phy_id = device->phy->identify.phy_identifier; diff --git a/drivers/scsi/hpsa.c b/drivers/scsi/hpsa.c index c73a71ac3c29..3654b12c5d5a 100644 --- a/drivers/scsi/hpsa.c +++ b/drivers/scsi/hpsa.c @@ -2662,10 +2662,8 @@ static void complete_scsi_command(struct CommandList *cp) case CMD_TARGET_STATUS: cmd->result |= ei->ScsiStatus; /* copy the sense data */ - if (SCSI_SENSE_BUFFERSIZE < sizeof(ei->SenseInfo)) - sense_data_size = SCSI_SENSE_BUFFERSIZE; - else - sense_data_size = sizeof(ei->SenseInfo); + sense_data_size = min_t(unsigned long, SCSI_SENSE_BUFFERSIZE, + sizeof(ei->SenseInfo)); if (ei->SenseLen < sense_data_size) sense_data_size = ei->SenseLen; memcpy(cmd->sense_buffer, ei->SenseInfo, sense_data_size); @@ -3628,10 +3626,7 @@ static bool hpsa_vpd_page_supported(struct ctlr_info *h, if (rc != 0) goto exit_unsupported; pages = buf[3]; - if ((pages + HPSA_VPD_HEADER_SZ) <= 255) - bufsize = pages + HPSA_VPD_HEADER_SZ; - else - bufsize = 255; + bufsize = min(pages + HPSA_VPD_HEADER_SZ, 255); /* Get the whole VPD page list */ rc = hpsa_scsi_do_inquiry(h, scsi3addr, @@ -6407,18 +6402,14 @@ static int hpsa_passthru_ioctl(struct ctlr_info *h, return -EINVAL; } if (iocommand->buf_size > 0) { - buff = kmalloc(iocommand->buf_size, GFP_KERNEL); - if (buff == NULL) - return -ENOMEM; if (iocommand->Request.Type.Direction & XFER_WRITE) { - /* Copy the data into the buffer we created */ - if (copy_from_user(buff, iocommand->buf, - iocommand->buf_size)) { - rc = -EFAULT; - goto out_kfree; - } + buff = memdup_user(iocommand->buf, iocommand->buf_size); + if (IS_ERR(buff)) + return PTR_ERR(buff); } else { - memset(buff, 0, iocommand->buf_size); + buff = kzalloc(iocommand->buf_size, GFP_KERNEL); + if (!buff) + return -ENOMEM; } } c = cmd_alloc(h); @@ -6478,7 +6469,6 @@ static int hpsa_passthru_ioctl(struct ctlr_info *h, } out: cmd_free(h, c); -out_kfree: kfree(buff); return rc; } @@ -6522,18 +6512,21 @@ static int hpsa_big_passthru_ioctl(struct ctlr_info *h, while (left) { sz = (left > ioc->malloc_size) ? 
ioc->malloc_size : left; buff_size[sg_used] = sz; - buff[sg_used] = kmalloc(sz, GFP_KERNEL); - if (buff[sg_used] == NULL) { - status = -ENOMEM; - goto cleanup1; - } + if (ioc->Request.Type.Direction & XFER_WRITE) { - if (copy_from_user(buff[sg_used], data_ptr, sz)) { - status = -EFAULT; + buff[sg_used] = memdup_user(data_ptr, sz); + if (IS_ERR(buff[sg_used])) { + status = PTR_ERR(buff[sg_used]); goto cleanup1; } - } else - memset(buff[sg_used], 0, sz); + } else { + buff[sg_used] = kzalloc(sz, GFP_KERNEL); + if (!buff[sg_used]) { + status = -ENOMEM; + goto cleanup1; + } + } + left -= sz; data_ptr += sz; sg_used++; @@ -7632,8 +7625,8 @@ static void hpsa_free_cfgtables(struct ctlr_info *h) } /* Find and map CISS config table and transfer table -+ * several items must be unmapped (freed) later -+ * */ + * several items must be unmapped (freed) later + */ static int hpsa_find_cfgtables(struct ctlr_info *h) { u64 cfg_offset; diff --git a/drivers/scsi/ipr.c b/drivers/scsi/ipr.c index dd6754db7e4c..44214884deaf 100644 --- a/drivers/scsi/ipr.c +++ b/drivers/scsi/ipr.c @@ -4281,11 +4281,11 @@ static int ipr_alloc_dump(struct ipr_ioa_cfg *ioa_cfg) } if (ioa_cfg->sis64) - ioa_data = vmalloc(array_size(IPR_FMT3_MAX_NUM_DUMP_PAGES, - sizeof(__be32 *))); + ioa_data = vmalloc_array(IPR_FMT3_MAX_NUM_DUMP_PAGES, + sizeof(__be32 *)); else - ioa_data = vmalloc(array_size(IPR_FMT2_MAX_NUM_DUMP_PAGES, - sizeof(__be32 *))); + ioa_data = vmalloc_array(IPR_FMT2_MAX_NUM_DUMP_PAGES, + sizeof(__be32 *)); if (!ioa_data) { ipr_err("Dump memory allocation failed\n"); diff --git a/drivers/scsi/isci/remote_device.c b/drivers/scsi/isci/remote_device.c index 82deb6a83a8c..4c7462965ea1 100644 --- a/drivers/scsi/isci/remote_device.c +++ b/drivers/scsi/isci/remote_device.c @@ -1434,7 +1434,7 @@ static enum sci_status isci_remote_device_construct(struct isci_port *iport, struct domain_device *dev = idev->domain_dev; enum sci_status status; - if (dev->parent && dev_is_expander(dev->parent->dev_type)) + if (dev_parent_is_expander(dev)) status = sci_remote_device_ea_construct(iport, idev); else status = sci_remote_device_da_construct(iport, idev); diff --git a/drivers/scsi/libfc/fc_encode.h b/drivers/scsi/libfc/fc_encode.h index 02e31db31d68..e046091a549a 100644 --- a/drivers/scsi/libfc/fc_encode.h +++ b/drivers/scsi/libfc/fc_encode.h @@ -356,7 +356,7 @@ static inline int fc_ct_ms_fill(struct fc_lport *lport, put_unaligned_be16(len, &entry->len); snprintf((char *)&entry->value, FC_FDMI_HBA_ATTR_OSNAMEVERSION_LEN, - "%s v%s", + "%.62s v%.62s", init_utsname()->sysname, init_utsname()->release); diff --git a/drivers/scsi/libsas/sas_expander.c b/drivers/scsi/libsas/sas_expander.c index 869b5d4db44c..d953225f6cc2 100644 --- a/drivers/scsi/libsas/sas_expander.c +++ b/drivers/scsi/libsas/sas_expander.c @@ -1313,10 +1313,7 @@ static int sas_check_parent_topology(struct domain_device *child) int i; int res = 0; - if (!child->parent) - return 0; - - if (!dev_is_expander(child->parent->dev_type)) + if (!dev_parent_is_expander(child)) return 0; parent_ex = &child->parent->ex_dev; diff --git a/drivers/scsi/lpfc/lpfc.h b/drivers/scsi/lpfc/lpfc.h index fe4fb67eb50c..224edacf2d8e 100644 --- a/drivers/scsi/lpfc/lpfc.h +++ b/drivers/scsi/lpfc/lpfc.h @@ -1,7 +1,7 @@ /******************************************************************* * This file is part of the Emulex Linux Device Driver for * * Fibre Channel Host Bus Adapters. * - * Copyright (C) 2017-2024 Broadcom. All Rights Reserved. The term * + * Copyright (C) 2017-2025 Broadcom. 
All Rights Reserved. The term * * “Broadcom” refers to Broadcom Inc. and/or its subsidiaries. * * Copyright (C) 2004-2016 Emulex. All rights reserved. * * EMULEX and SLI are trademarks of Emulex. * @@ -661,15 +661,12 @@ struct lpfc_vport { uint32_t num_disc_nodes; /* in addition to hba_state */ uint32_t gidft_inp; /* cnt of outstanding GID_FTs */ - uint32_t fc_nlp_cnt; /* outstanding NODELIST requests */ uint32_t fc_rscn_id_cnt; /* count of RSCNs payloads in list */ uint32_t fc_rscn_flush; /* flag use of fc_rscn_id_list */ struct lpfc_dmabuf *fc_rscn_id_list[FC_MAX_HOLD_RSCN]; struct lpfc_name fc_nodename; /* fc nodename */ struct lpfc_name fc_portname; /* fc portname */ - struct lpfc_work_evt disc_timeout_evt; - struct timer_list fc_disctmo; /* Discovery rescue timer */ uint8_t fc_ns_retry; /* retries for fabric nameserver */ uint32_t fc_prli_sent; /* cntr for outstanding PRLIs */ @@ -744,12 +741,6 @@ struct lpfc_vport { struct lpfc_vmid_priority_info vmid_priority; #ifdef CONFIG_SCSI_LPFC_DEBUG_FS - struct dentry *debug_disc_trc; - struct dentry *debug_nodelist; - struct dentry *debug_nvmestat; - struct dentry *debug_scsistat; - struct dentry *debug_ioktime; - struct dentry *debug_hdwqstat; struct dentry *vport_debugfs_root; struct lpfc_debugfs_trc *disc_trc; atomic_t disc_trc_cnt; @@ -767,7 +758,6 @@ struct lpfc_vport { /* There is a single nvme instance per vport. */ struct nvme_fc_local_port *localport; uint8_t nvmei_support; /* driver supports NVME Initiator */ - uint32_t last_fcp_wqidx; uint32_t rcv_flogi_cnt; /* How many unsol FLOGIs ACK'd. */ }; @@ -1060,8 +1050,6 @@ struct lpfc_hba { struct lpfc_dmabuf hbqslimp; - uint16_t pci_cfg_value; - uint8_t fc_linkspeed; /* Link speed after last READ_LA */ uint32_t fc_eventTag; /* event tag for link attention */ @@ -1088,7 +1076,6 @@ struct lpfc_hba { struct lpfc_stats fc_stat; - struct lpfc_nodelist fc_fcpnodev; /* nodelist entry for no device */ uint32_t nport_event_cnt; /* timestamp for nlplist entry */ uint8_t wwnn[8]; @@ -1229,9 +1216,6 @@ struct lpfc_hba { uint32_t hbq_count; /* Count of configured HBQs */ struct hbq_s hbqs[LPFC_MAX_HBQS]; /* local copy of hbq indicies */ - atomic_t fcp_qidx; /* next FCP WQ (RR Policy) */ - atomic_t nvme_qidx; /* next NVME WQ (RR Policy) */ - phys_addr_t pci_bar0_map; /* Physical address for PCI BAR0 */ phys_addr_t pci_bar1_map; /* Physical address for PCI BAR1 */ phys_addr_t pci_bar2_map; /* Physical address for PCI BAR2 */ @@ -1348,30 +1332,9 @@ struct lpfc_hba { unsigned long last_ramp_down_time; #ifdef CONFIG_SCSI_LPFC_DEBUG_FS struct dentry *hba_debugfs_root; - atomic_t debugfs_vport_count; - struct dentry *debug_multixri_pools; - struct dentry *debug_hbqinfo; - struct dentry *debug_dumpHostSlim; - struct dentry *debug_dumpHBASlim; - struct dentry *debug_InjErrLBA; /* LBA to inject errors at */ - struct dentry *debug_InjErrNPortID; /* NPortID to inject errors at */ - struct dentry *debug_InjErrWWPN; /* WWPN to inject errors at */ - struct dentry *debug_writeGuard; /* inject write guard_tag errors */ - struct dentry *debug_writeApp; /* inject write app_tag errors */ - struct dentry *debug_writeRef; /* inject write ref_tag errors */ - struct dentry *debug_readGuard; /* inject read guard_tag errors */ - struct dentry *debug_readApp; /* inject read app_tag errors */ - struct dentry *debug_readRef; /* inject read ref_tag errors */ - - struct dentry *debug_nvmeio_trc; + unsigned int debugfs_vport_count; + struct lpfc_debugfs_nvmeio_trc *nvmeio_trc; - struct dentry *debug_hdwqinfo; -#ifdef 
LPFC_HDWQ_LOCK_STAT - struct dentry *debug_lockstat; -#endif - struct dentry *debug_cgn_buffer; - struct dentry *debug_rx_monitor; - struct dentry *debug_ras_log; atomic_t nvmeio_trc_cnt; uint32_t nvmeio_trc_size; uint32_t nvmeio_trc_output_idx; @@ -1388,19 +1351,10 @@ struct lpfc_hba { sector_t lpfc_injerr_lba; #define LPFC_INJERR_LBA_OFF (sector_t)(-1) - struct dentry *debug_slow_ring_trc; struct lpfc_debugfs_trc *slow_ring_trc; atomic_t slow_ring_trc_cnt; /* iDiag debugfs sub-directory */ struct dentry *idiag_root; - struct dentry *idiag_pci_cfg; - struct dentry *idiag_bar_acc; - struct dentry *idiag_que_info; - struct dentry *idiag_que_acc; - struct dentry *idiag_drb_acc; - struct dentry *idiag_ctl_acc; - struct dentry *idiag_mbx_acc; - struct dentry *idiag_ext_acc; uint8_t lpfc_idiag_last_eq; #endif uint16_t nvmeio_trc_on; diff --git a/drivers/scsi/lpfc/lpfc_debugfs.c b/drivers/scsi/lpfc/lpfc_debugfs.c index 7c4d7bb3a56f..92b5b2dbe847 100644 --- a/drivers/scsi/lpfc/lpfc_debugfs.c +++ b/drivers/scsi/lpfc/lpfc_debugfs.c @@ -2373,93 +2373,117 @@ out: static ssize_t lpfc_debugfs_dif_err_read(struct file *file, char __user *buf, - size_t nbytes, loff_t *ppos) + size_t nbytes, loff_t *ppos) { struct lpfc_hba *phba = file->private_data; int kind = debugfs_get_aux_num(file); - char cbuf[32]; - uint64_t tmp = 0; + char cbuf[32] = {0}; int cnt = 0; - if (kind == writeGuard) - cnt = scnprintf(cbuf, 32, "%u\n", phba->lpfc_injerr_wgrd_cnt); - else if (kind == writeApp) - cnt = scnprintf(cbuf, 32, "%u\n", phba->lpfc_injerr_wapp_cnt); - else if (kind == writeRef) - cnt = scnprintf(cbuf, 32, "%u\n", phba->lpfc_injerr_wref_cnt); - else if (kind == readGuard) - cnt = scnprintf(cbuf, 32, "%u\n", phba->lpfc_injerr_rgrd_cnt); - else if (kind == readApp) - cnt = scnprintf(cbuf, 32, "%u\n", phba->lpfc_injerr_rapp_cnt); - else if (kind == readRef) - cnt = scnprintf(cbuf, 32, "%u\n", phba->lpfc_injerr_rref_cnt); - else if (kind == InjErrNPortID) - cnt = scnprintf(cbuf, 32, "0x%06x\n", + switch (kind) { + case writeGuard: + cnt = scnprintf(cbuf, sizeof(cbuf), "%u\n", + phba->lpfc_injerr_wgrd_cnt); + break; + case writeApp: + cnt = scnprintf(cbuf, sizeof(cbuf), "%u\n", + phba->lpfc_injerr_wapp_cnt); + break; + case writeRef: + cnt = scnprintf(cbuf, sizeof(cbuf), "%u\n", + phba->lpfc_injerr_wref_cnt); + break; + case readGuard: + cnt = scnprintf(cbuf, sizeof(cbuf), "%u\n", + phba->lpfc_injerr_rgrd_cnt); + break; + case readApp: + cnt = scnprintf(cbuf, sizeof(cbuf), "%u\n", + phba->lpfc_injerr_rapp_cnt); + break; + case readRef: + cnt = scnprintf(cbuf, sizeof(cbuf), "%u\n", + phba->lpfc_injerr_rref_cnt); + break; + case InjErrNPortID: + cnt = scnprintf(cbuf, sizeof(cbuf), "0x%06x\n", phba->lpfc_injerr_nportid); - else if (kind == InjErrWWPN) { - memcpy(&tmp, &phba->lpfc_injerr_wwpn, sizeof(struct lpfc_name)); - tmp = cpu_to_be64(tmp); - cnt = scnprintf(cbuf, 32, "0x%016llx\n", tmp); - } else if (kind == InjErrLBA) { - if (phba->lpfc_injerr_lba == (sector_t)(-1)) - cnt = scnprintf(cbuf, 32, "off\n"); + break; + case InjErrWWPN: + cnt = scnprintf(cbuf, sizeof(cbuf), "0x%016llx\n", + be64_to_cpu(phba->lpfc_injerr_wwpn.u.wwn_be)); + break; + case InjErrLBA: + if (phba->lpfc_injerr_lba == LPFC_INJERR_LBA_OFF) + cnt = scnprintf(cbuf, sizeof(cbuf), "off\n"); else - cnt = scnprintf(cbuf, 32, "0x%llx\n", - (uint64_t) phba->lpfc_injerr_lba); - } else - lpfc_printf_log(phba, KERN_ERR, LOG_INIT, - "0547 Unknown debugfs error injection entry\n"); + cnt = scnprintf(cbuf, sizeof(cbuf), "0x%llx\n", + 
(uint64_t)phba->lpfc_injerr_lba); + break; + default: + lpfc_log_msg(phba, KERN_WARNING, LOG_INIT, + "0547 Unknown debugfs error injection entry\n"); + break; + } return simple_read_from_buffer(buf, nbytes, ppos, &cbuf, cnt); } static ssize_t lpfc_debugfs_dif_err_write(struct file *file, const char __user *buf, - size_t nbytes, loff_t *ppos) + size_t nbytes, loff_t *ppos) { struct lpfc_hba *phba = file->private_data; int kind = debugfs_get_aux_num(file); - char dstbuf[33]; - uint64_t tmp = 0; - int size; + char dstbuf[33] = {0}; + unsigned long long tmp; + unsigned long size; - memset(dstbuf, 0, 33); - size = (nbytes < 32) ? nbytes : 32; + size = (nbytes < (sizeof(dstbuf) - 1)) ? nbytes : (sizeof(dstbuf) - 1); if (copy_from_user(dstbuf, buf, size)) return -EFAULT; - if (kind == InjErrLBA) { - if ((dstbuf[0] == 'o') && (dstbuf[1] == 'f') && - (dstbuf[2] == 'f')) - tmp = (uint64_t)(-1); + if (kstrtoull(dstbuf, 0, &tmp)) { + if (kind != InjErrLBA || !strstr(dstbuf, "off")) + return -EINVAL; } - if ((tmp == 0) && (kstrtoull(dstbuf, 0, &tmp))) - return -EINVAL; - - if (kind == writeGuard) + switch (kind) { + case writeGuard: phba->lpfc_injerr_wgrd_cnt = (uint32_t)tmp; - else if (kind == writeApp) + break; + case writeApp: phba->lpfc_injerr_wapp_cnt = (uint32_t)tmp; - else if (kind == writeRef) + break; + case writeRef: phba->lpfc_injerr_wref_cnt = (uint32_t)tmp; - else if (kind == readGuard) + break; + case readGuard: phba->lpfc_injerr_rgrd_cnt = (uint32_t)tmp; - else if (kind == readApp) + break; + case readApp: phba->lpfc_injerr_rapp_cnt = (uint32_t)tmp; - else if (kind == readRef) + break; + case readRef: phba->lpfc_injerr_rref_cnt = (uint32_t)tmp; - else if (kind == InjErrLBA) - phba->lpfc_injerr_lba = (sector_t)tmp; - else if (kind == InjErrNPortID) + break; + case InjErrLBA: + if (strstr(dstbuf, "off")) + phba->lpfc_injerr_lba = LPFC_INJERR_LBA_OFF; + else + phba->lpfc_injerr_lba = (sector_t)tmp; + break; + case InjErrNPortID: phba->lpfc_injerr_nportid = (uint32_t)(tmp & Mask_DID); - else if (kind == InjErrWWPN) { - tmp = cpu_to_be64(tmp); - memcpy(&phba->lpfc_injerr_wwpn, &tmp, sizeof(struct lpfc_name)); - } else - lpfc_printf_log(phba, KERN_ERR, LOG_INIT, - "0548 Unknown debugfs error injection entry\n"); - + break; + case InjErrWWPN: + phba->lpfc_injerr_wwpn.u.wwn_be = cpu_to_be64(tmp); + break; + default: + lpfc_log_msg(phba, KERN_WARNING, LOG_INIT, + "0548 Unknown debugfs error injection entry\n"); + break; + } return nbytes; } @@ -5728,7 +5752,7 @@ static const struct file_operations lpfc_debugfs_op_slow_ring_trc = { }; static struct dentry *lpfc_debugfs_root = NULL; -static atomic_t lpfc_debugfs_hba_count; +static unsigned int lpfc_debugfs_hba_count; /* * File operations for the iDiag debugfs @@ -6050,7 +6074,12 @@ lpfc_debugfs_initialize(struct lpfc_vport *vport) /* Setup lpfc root directory */ if (!lpfc_debugfs_root) { lpfc_debugfs_root = debugfs_create_dir("lpfc", NULL); - atomic_set(&lpfc_debugfs_hba_count, 0); + lpfc_debugfs_hba_count = 0; + if (IS_ERR(lpfc_debugfs_root)) { + lpfc_vlog_msg(vport, KERN_WARNING, LOG_INIT, + "0527 Cannot create debugfs lpfc\n"); + return; + } } if (!lpfc_debugfs_start_time) lpfc_debugfs_start_time = jiffies; @@ -6061,150 +6090,96 @@ lpfc_debugfs_initialize(struct lpfc_vport *vport) pport_setup = true; phba->hba_debugfs_root = debugfs_create_dir(name, lpfc_debugfs_root); - atomic_inc(&lpfc_debugfs_hba_count); - atomic_set(&phba->debugfs_vport_count, 0); + phba->debugfs_vport_count = 0; + if (IS_ERR(phba->hba_debugfs_root)) { + lpfc_vlog_msg(vport, 
KERN_WARNING, LOG_INIT, + "0528 Cannot create debugfs %s\n", name); + return; + } + lpfc_debugfs_hba_count++; /* Multi-XRI pools */ - snprintf(name, sizeof(name), "multixripools"); - phba->debug_multixri_pools = - debugfs_create_file(name, S_IFREG | 0644, - phba->hba_debugfs_root, - phba, - &lpfc_debugfs_op_multixripools); - if (IS_ERR(phba->debug_multixri_pools)) { - lpfc_printf_vlog(vport, KERN_ERR, LOG_INIT, - "0527 Cannot create debugfs multixripools\n"); - goto debug_failed; - } + debugfs_create_file("multixripools", 0644, + phba->hba_debugfs_root, phba, + &lpfc_debugfs_op_multixripools); /* Congestion Info Buffer */ - scnprintf(name, sizeof(name), "cgn_buffer"); - phba->debug_cgn_buffer = - debugfs_create_file(name, S_IFREG | 0644, - phba->hba_debugfs_root, - phba, &lpfc_cgn_buffer_op); - if (IS_ERR(phba->debug_cgn_buffer)) { - lpfc_printf_vlog(vport, KERN_ERR, LOG_INIT, - "6527 Cannot create debugfs " - "cgn_buffer\n"); - goto debug_failed; - } + debugfs_create_file("cgn_buffer", 0644, phba->hba_debugfs_root, + phba, &lpfc_cgn_buffer_op); /* RX Monitor */ - scnprintf(name, sizeof(name), "rx_monitor"); - phba->debug_rx_monitor = - debugfs_create_file(name, S_IFREG | 0644, - phba->hba_debugfs_root, - phba, &lpfc_rx_monitor_op); - if (IS_ERR(phba->debug_rx_monitor)) { - lpfc_printf_vlog(vport, KERN_ERR, LOG_INIT, - "6528 Cannot create debugfs " - "rx_monitor\n"); - goto debug_failed; - } + debugfs_create_file("rx_monitor", 0644, phba->hba_debugfs_root, + phba, &lpfc_rx_monitor_op); /* RAS log */ - snprintf(name, sizeof(name), "ras_log"); - phba->debug_ras_log = - debugfs_create_file(name, 0644, - phba->hba_debugfs_root, - phba, &lpfc_debugfs_ras_log); - if (IS_ERR(phba->debug_ras_log)) { - lpfc_printf_vlog(vport, KERN_ERR, LOG_INIT, - "6148 Cannot create debugfs" - " ras_log\n"); - goto debug_failed; - } + debugfs_create_file("ras_log", 0644, phba->hba_debugfs_root, + phba, &lpfc_debugfs_ras_log); /* Setup hbqinfo */ - snprintf(name, sizeof(name), "hbqinfo"); - phba->debug_hbqinfo = - debugfs_create_file(name, S_IFREG | 0644, - phba->hba_debugfs_root, - phba, &lpfc_debugfs_op_hbqinfo); + debugfs_create_file("hbqinfo", 0644, phba->hba_debugfs_root, + phba, &lpfc_debugfs_op_hbqinfo); #ifdef LPFC_HDWQ_LOCK_STAT /* Setup lockstat */ - snprintf(name, sizeof(name), "lockstat"); - phba->debug_lockstat = - debugfs_create_file(name, S_IFREG | 0644, - phba->hba_debugfs_root, - phba, &lpfc_debugfs_op_lockstat); - if (IS_ERR(phba->debug_lockstat)) { - lpfc_printf_vlog(vport, KERN_ERR, LOG_INIT, - "4610 Can't create debugfs lockstat\n"); - goto debug_failed; - } + debugfs_create_file("lockstat", 0644, phba->hba_debugfs_root, + phba, &lpfc_debugfs_op_lockstat); #endif - - /* Setup dumpHBASlim */ if (phba->sli_rev < LPFC_SLI_REV4) { - snprintf(name, sizeof(name), "dumpHBASlim"); - phba->debug_dumpHBASlim = - debugfs_create_file(name, - S_IFREG|S_IRUGO|S_IWUSR, - phba->hba_debugfs_root, - phba, &lpfc_debugfs_op_dumpHBASlim); - } else - phba->debug_dumpHBASlim = NULL; + /* Setup dumpHBASlim */ + debugfs_create_file("dumpHBASlim", 0644, + phba->hba_debugfs_root, phba, + &lpfc_debugfs_op_dumpHBASlim); + } - /* Setup dumpHostSlim */ if (phba->sli_rev < LPFC_SLI_REV4) { - snprintf(name, sizeof(name), "dumpHostSlim"); - phba->debug_dumpHostSlim = - debugfs_create_file(name, - S_IFREG|S_IRUGO|S_IWUSR, - phba->hba_debugfs_root, - phba, &lpfc_debugfs_op_dumpHostSlim); - } else - phba->debug_dumpHostSlim = NULL; + /* Setup dumpHostSlim */ + debugfs_create_file("dumpHostSlim", 0644, + phba->hba_debugfs_root, 
phba, + &lpfc_debugfs_op_dumpHostSlim); + } /* Setup DIF Error Injections */ - phba->debug_InjErrLBA = - debugfs_create_file_aux_num("InjErrLBA", 0644, - phba->hba_debugfs_root, - phba, InjErrLBA, &lpfc_debugfs_op_dif_err); + debugfs_create_file_aux_num("InjErrLBA", 0644, + phba->hba_debugfs_root, phba, + InjErrLBA, + &lpfc_debugfs_op_dif_err); phba->lpfc_injerr_lba = LPFC_INJERR_LBA_OFF; - phba->debug_InjErrNPortID = - debugfs_create_file_aux_num("InjErrNPortID", 0644, - phba->hba_debugfs_root, - phba, InjErrNPortID, &lpfc_debugfs_op_dif_err); - - phba->debug_InjErrWWPN = - debugfs_create_file_aux_num("InjErrWWPN", 0644, - phba->hba_debugfs_root, - phba, InjErrWWPN, &lpfc_debugfs_op_dif_err); - - phba->debug_writeGuard = - debugfs_create_file_aux_num("writeGuardInjErr", 0644, - phba->hba_debugfs_root, - phba, writeGuard, &lpfc_debugfs_op_dif_err); - - phba->debug_writeApp = - debugfs_create_file_aux_num("writeAppInjErr", 0644, - phba->hba_debugfs_root, - phba, writeApp, &lpfc_debugfs_op_dif_err); - - phba->debug_writeRef = - debugfs_create_file_aux_num("writeRefInjErr", 0644, - phba->hba_debugfs_root, - phba, writeRef, &lpfc_debugfs_op_dif_err); - - phba->debug_readGuard = - debugfs_create_file_aux_num("readGuardInjErr", 0644, - phba->hba_debugfs_root, - phba, readGuard, &lpfc_debugfs_op_dif_err); - - phba->debug_readApp = - debugfs_create_file_aux_num("readAppInjErr", 0644, - phba->hba_debugfs_root, - phba, readApp, &lpfc_debugfs_op_dif_err); - - phba->debug_readRef = - debugfs_create_file_aux_num("readRefInjErr", 0644, - phba->hba_debugfs_root, - phba, readRef, &lpfc_debugfs_op_dif_err); + debugfs_create_file_aux_num("InjErrNPortID", 0644, + phba->hba_debugfs_root, phba, + InjErrNPortID, + &lpfc_debugfs_op_dif_err); + + debugfs_create_file_aux_num("InjErrWWPN", 0644, + phba->hba_debugfs_root, phba, + InjErrWWPN, + &lpfc_debugfs_op_dif_err); + + debugfs_create_file_aux_num("writeGuardInjErr", 0644, + phba->hba_debugfs_root, phba, + writeGuard, + &lpfc_debugfs_op_dif_err); + + debugfs_create_file_aux_num("writeAppInjErr", 0644, + phba->hba_debugfs_root, phba, + writeApp, &lpfc_debugfs_op_dif_err); + + debugfs_create_file_aux_num("writeRefInjErr", 0644, + phba->hba_debugfs_root, phba, + writeRef, &lpfc_debugfs_op_dif_err); + + debugfs_create_file_aux_num("readGuardInjErr", 0644, + phba->hba_debugfs_root, phba, + readGuard, + &lpfc_debugfs_op_dif_err); + + debugfs_create_file_aux_num("readAppInjErr", 0644, + phba->hba_debugfs_root, phba, + readApp, &lpfc_debugfs_op_dif_err); + + debugfs_create_file_aux_num("readRefInjErr", 0644, + phba->hba_debugfs_root, phba, + readRef, &lpfc_debugfs_op_dif_err); /* Setup slow ring trace */ if (lpfc_debugfs_max_slow_ring_trc) { @@ -6224,11 +6199,9 @@ lpfc_debugfs_initialize(struct lpfc_vport *vport) } } - snprintf(name, sizeof(name), "slow_ring_trace"); - phba->debug_slow_ring_trc = - debugfs_create_file(name, S_IFREG|S_IRUGO|S_IWUSR, - phba->hba_debugfs_root, - phba, &lpfc_debugfs_op_slow_ring_trc); + debugfs_create_file("slow_ring_trace", 0644, + phba->hba_debugfs_root, phba, + &lpfc_debugfs_op_slow_ring_trc); if (!phba->slow_ring_trc) { phba->slow_ring_trc = kcalloc( lpfc_debugfs_max_slow_ring_trc, @@ -6238,16 +6211,13 @@ lpfc_debugfs_initialize(struct lpfc_vport *vport) lpfc_printf_vlog(vport, KERN_ERR, LOG_INIT, "0416 Cannot create debugfs " "slow_ring buffer\n"); - goto debug_failed; + goto out; } atomic_set(&phba->slow_ring_trc_cnt, 0); } - snprintf(name, sizeof(name), "nvmeio_trc"); - phba->debug_nvmeio_trc = - debugfs_create_file(name, 0644, - 
phba->hba_debugfs_root, - phba, &lpfc_debugfs_op_nvmeio_trc); + debugfs_create_file("nvmeio_trc", 0644, phba->hba_debugfs_root, + phba, &lpfc_debugfs_op_nvmeio_trc); atomic_set(&phba->nvmeio_trc_cnt, 0); if (lpfc_debugfs_max_nvmeio_trc) { @@ -6293,7 +6263,12 @@ nvmeio_off: if (!vport->vport_debugfs_root) { vport->vport_debugfs_root = debugfs_create_dir(name, phba->hba_debugfs_root); - atomic_inc(&phba->debugfs_vport_count); + if (IS_ERR(vport->vport_debugfs_root)) { + lpfc_vlog_msg(vport, KERN_WARNING, LOG_INIT, + "0529 Cannot create debugfs %s\n", name); + return; + } + phba->debugfs_vport_count++; } if (lpfc_debugfs_max_disc_trc) { @@ -6320,54 +6295,27 @@ nvmeio_off: lpfc_printf_vlog(vport, KERN_ERR, LOG_INIT, "0418 Cannot create debugfs disc trace " "buffer\n"); - goto debug_failed; + goto out; } atomic_set(&vport->disc_trc_cnt, 0); - snprintf(name, sizeof(name), "discovery_trace"); - vport->debug_disc_trc = - debugfs_create_file(name, S_IFREG|S_IRUGO|S_IWUSR, - vport->vport_debugfs_root, - vport, &lpfc_debugfs_op_disc_trc); - snprintf(name, sizeof(name), "nodelist"); - vport->debug_nodelist = - debugfs_create_file(name, S_IFREG|S_IRUGO|S_IWUSR, - vport->vport_debugfs_root, - vport, &lpfc_debugfs_op_nodelist); - - snprintf(name, sizeof(name), "nvmestat"); - vport->debug_nvmestat = - debugfs_create_file(name, 0644, - vport->vport_debugfs_root, - vport, &lpfc_debugfs_op_nvmestat); - - snprintf(name, sizeof(name), "scsistat"); - vport->debug_scsistat = - debugfs_create_file(name, 0644, - vport->vport_debugfs_root, - vport, &lpfc_debugfs_op_scsistat); - if (IS_ERR(vport->debug_scsistat)) { - lpfc_printf_vlog(vport, KERN_ERR, LOG_INIT, - "4611 Cannot create debugfs scsistat\n"); - goto debug_failed; - } + debugfs_create_file("discovery_trace", 0644, vport->vport_debugfs_root, + vport, &lpfc_debugfs_op_disc_trc); - snprintf(name, sizeof(name), "ioktime"); - vport->debug_ioktime = - debugfs_create_file(name, 0644, - vport->vport_debugfs_root, - vport, &lpfc_debugfs_op_ioktime); - if (IS_ERR(vport->debug_ioktime)) { - lpfc_printf_vlog(vport, KERN_ERR, LOG_INIT, - "0815 Cannot create debugfs ioktime\n"); - goto debug_failed; - } + debugfs_create_file("nodelist", 0644, vport->vport_debugfs_root, vport, + &lpfc_debugfs_op_nodelist); + + debugfs_create_file("nvmestat", 0644, vport->vport_debugfs_root, vport, + &lpfc_debugfs_op_nvmestat); - snprintf(name, sizeof(name), "hdwqstat"); - vport->debug_hdwqstat = - debugfs_create_file(name, 0644, - vport->vport_debugfs_root, - vport, &lpfc_debugfs_op_hdwqstat); + debugfs_create_file("scsistat", 0644, vport->vport_debugfs_root, vport, + &lpfc_debugfs_op_scsistat); + + debugfs_create_file("ioktime", 0644, vport->vport_debugfs_root, vport, + &lpfc_debugfs_op_ioktime); + + debugfs_create_file("hdwqstat", 0644, vport->vport_debugfs_root, vport, + &lpfc_debugfs_op_hdwqstat); /* * The following section is for additional directories/files for the @@ -6375,93 +6323,58 @@ nvmeio_off: */ if (!pport_setup) - goto debug_failed; + return; /* * iDiag debugfs root entry points for SLI4 device only */ if (phba->sli_rev < LPFC_SLI_REV4) - goto debug_failed; + return; - snprintf(name, sizeof(name), "iDiag"); if (!phba->idiag_root) { phba->idiag_root = - debugfs_create_dir(name, phba->hba_debugfs_root); + debugfs_create_dir("iDiag", phba->hba_debugfs_root); /* Initialize iDiag data structure */ memset(&idiag, 0, sizeof(idiag)); } /* iDiag read PCI config space */ - snprintf(name, sizeof(name), "pciCfg"); - if (!phba->idiag_pci_cfg) { - phba->idiag_pci_cfg = - 
debugfs_create_file(name, S_IFREG|S_IRUGO|S_IWUSR, - phba->idiag_root, phba, &lpfc_idiag_op_pciCfg); - idiag.offset.last_rd = 0; - } + debugfs_create_file("pciCfg", 0644, phba->idiag_root, phba, + &lpfc_idiag_op_pciCfg); + idiag.offset.last_rd = 0; /* iDiag PCI BAR access */ - snprintf(name, sizeof(name), "barAcc"); - if (!phba->idiag_bar_acc) { - phba->idiag_bar_acc = - debugfs_create_file(name, S_IFREG|S_IRUGO|S_IWUSR, - phba->idiag_root, phba, &lpfc_idiag_op_barAcc); - idiag.offset.last_rd = 0; - } + debugfs_create_file("barAcc", 0644, phba->idiag_root, phba, + &lpfc_idiag_op_barAcc); + idiag.offset.last_rd = 0; /* iDiag get PCI function queue information */ - snprintf(name, sizeof(name), "queInfo"); - if (!phba->idiag_que_info) { - phba->idiag_que_info = - debugfs_create_file(name, S_IFREG|S_IRUGO, - phba->idiag_root, phba, &lpfc_idiag_op_queInfo); - } + debugfs_create_file("queInfo", 0444, phba->idiag_root, phba, + &lpfc_idiag_op_queInfo); /* iDiag access PCI function queue */ - snprintf(name, sizeof(name), "queAcc"); - if (!phba->idiag_que_acc) { - phba->idiag_que_acc = - debugfs_create_file(name, S_IFREG|S_IRUGO|S_IWUSR, - phba->idiag_root, phba, &lpfc_idiag_op_queAcc); - } + debugfs_create_file("queAcc", 0644, phba->idiag_root, phba, + &lpfc_idiag_op_queAcc); /* iDiag access PCI function doorbell registers */ - snprintf(name, sizeof(name), "drbAcc"); - if (!phba->idiag_drb_acc) { - phba->idiag_drb_acc = - debugfs_create_file(name, S_IFREG|S_IRUGO|S_IWUSR, - phba->idiag_root, phba, &lpfc_idiag_op_drbAcc); - } + debugfs_create_file("drbAcc", 0644, phba->idiag_root, phba, + &lpfc_idiag_op_drbAcc); /* iDiag access PCI function control registers */ - snprintf(name, sizeof(name), "ctlAcc"); - if (!phba->idiag_ctl_acc) { - phba->idiag_ctl_acc = - debugfs_create_file(name, S_IFREG|S_IRUGO|S_IWUSR, - phba->idiag_root, phba, &lpfc_idiag_op_ctlAcc); - } + debugfs_create_file("ctlAcc", 0644, phba->idiag_root, phba, + &lpfc_idiag_op_ctlAcc); /* iDiag access mbox commands */ - snprintf(name, sizeof(name), "mbxAcc"); - if (!phba->idiag_mbx_acc) { - phba->idiag_mbx_acc = - debugfs_create_file(name, S_IFREG|S_IRUGO|S_IWUSR, - phba->idiag_root, phba, &lpfc_idiag_op_mbxAcc); - } + debugfs_create_file("mbxAcc", 0644, phba->idiag_root, phba, + &lpfc_idiag_op_mbxAcc); /* iDiag extents access commands */ if (phba->sli4_hba.extents_in_use) { - snprintf(name, sizeof(name), "extAcc"); - if (!phba->idiag_ext_acc) { - phba->idiag_ext_acc = - debugfs_create_file(name, - S_IFREG|S_IRUGO|S_IWUSR, - phba->idiag_root, phba, - &lpfc_idiag_op_extAcc); - } + debugfs_create_file("extAcc", 0644, phba->idiag_root, phba, + &lpfc_idiag_op_extAcc); } - -debug_failed: +out: + /* alloc'ed items are kfree'd in lpfc_debugfs_terminate */ return; #endif } @@ -6486,145 +6399,26 @@ lpfc_debugfs_terminate(struct lpfc_vport *vport) kfree(vport->disc_trc); vport->disc_trc = NULL; - debugfs_remove(vport->debug_disc_trc); /* discovery_trace */ - vport->debug_disc_trc = NULL; - - debugfs_remove(vport->debug_nodelist); /* nodelist */ - vport->debug_nodelist = NULL; - - debugfs_remove(vport->debug_nvmestat); /* nvmestat */ - vport->debug_nvmestat = NULL; - - debugfs_remove(vport->debug_scsistat); /* scsistat */ - vport->debug_scsistat = NULL; - - debugfs_remove(vport->debug_ioktime); /* ioktime */ - vport->debug_ioktime = NULL; - - debugfs_remove(vport->debug_hdwqstat); /* hdwqstat */ - vport->debug_hdwqstat = NULL; - if (vport->vport_debugfs_root) { debugfs_remove(vport->vport_debugfs_root); /* vportX */ vport->vport_debugfs_root = NULL; 
- atomic_dec(&phba->debugfs_vport_count); + phba->debugfs_vport_count--; } - if (atomic_read(&phba->debugfs_vport_count) == 0) { - - debugfs_remove(phba->debug_multixri_pools); /* multixripools*/ - phba->debug_multixri_pools = NULL; - - debugfs_remove(phba->debug_hbqinfo); /* hbqinfo */ - phba->debug_hbqinfo = NULL; - - debugfs_remove(phba->debug_cgn_buffer); - phba->debug_cgn_buffer = NULL; - - debugfs_remove(phba->debug_rx_monitor); - phba->debug_rx_monitor = NULL; - - debugfs_remove(phba->debug_ras_log); - phba->debug_ras_log = NULL; - -#ifdef LPFC_HDWQ_LOCK_STAT - debugfs_remove(phba->debug_lockstat); /* lockstat */ - phba->debug_lockstat = NULL; -#endif - debugfs_remove(phba->debug_dumpHBASlim); /* HBASlim */ - phba->debug_dumpHBASlim = NULL; - - debugfs_remove(phba->debug_dumpHostSlim); /* HostSlim */ - phba->debug_dumpHostSlim = NULL; - - debugfs_remove(phba->debug_InjErrLBA); /* InjErrLBA */ - phba->debug_InjErrLBA = NULL; - - debugfs_remove(phba->debug_InjErrNPortID); - phba->debug_InjErrNPortID = NULL; - - debugfs_remove(phba->debug_InjErrWWPN); /* InjErrWWPN */ - phba->debug_InjErrWWPN = NULL; - - debugfs_remove(phba->debug_writeGuard); /* writeGuard */ - phba->debug_writeGuard = NULL; - - debugfs_remove(phba->debug_writeApp); /* writeApp */ - phba->debug_writeApp = NULL; - - debugfs_remove(phba->debug_writeRef); /* writeRef */ - phba->debug_writeRef = NULL; - - debugfs_remove(phba->debug_readGuard); /* readGuard */ - phba->debug_readGuard = NULL; - - debugfs_remove(phba->debug_readApp); /* readApp */ - phba->debug_readApp = NULL; - - debugfs_remove(phba->debug_readRef); /* readRef */ - phba->debug_readRef = NULL; - + if (!phba->debugfs_vport_count) { kfree(phba->slow_ring_trc); phba->slow_ring_trc = NULL; - /* slow_ring_trace */ - debugfs_remove(phba->debug_slow_ring_trc); - phba->debug_slow_ring_trc = NULL; - - debugfs_remove(phba->debug_nvmeio_trc); - phba->debug_nvmeio_trc = NULL; - kfree(phba->nvmeio_trc); phba->nvmeio_trc = NULL; - /* - * iDiag release - */ - if (phba->sli_rev == LPFC_SLI_REV4) { - /* iDiag extAcc */ - debugfs_remove(phba->idiag_ext_acc); - phba->idiag_ext_acc = NULL; - - /* iDiag mbxAcc */ - debugfs_remove(phba->idiag_mbx_acc); - phba->idiag_mbx_acc = NULL; - - /* iDiag ctlAcc */ - debugfs_remove(phba->idiag_ctl_acc); - phba->idiag_ctl_acc = NULL; - - /* iDiag drbAcc */ - debugfs_remove(phba->idiag_drb_acc); - phba->idiag_drb_acc = NULL; - - /* iDiag queAcc */ - debugfs_remove(phba->idiag_que_acc); - phba->idiag_que_acc = NULL; - - /* iDiag queInfo */ - debugfs_remove(phba->idiag_que_info); - phba->idiag_que_info = NULL; - - /* iDiag barAcc */ - debugfs_remove(phba->idiag_bar_acc); - phba->idiag_bar_acc = NULL; - - /* iDiag pciCfg */ - debugfs_remove(phba->idiag_pci_cfg); - phba->idiag_pci_cfg = NULL; - - /* Finally remove the iDiag debugfs root */ - debugfs_remove(phba->idiag_root); - phba->idiag_root = NULL; - } - if (phba->hba_debugfs_root) { debugfs_remove(phba->hba_debugfs_root); /* fnX */ phba->hba_debugfs_root = NULL; - atomic_dec(&lpfc_debugfs_hba_count); + lpfc_debugfs_hba_count--; } - if (atomic_read(&lpfc_debugfs_hba_count) == 0) { + if (!lpfc_debugfs_hba_count) { debugfs_remove(lpfc_debugfs_root); /* lpfc */ lpfc_debugfs_root = NULL; }
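The lpfc_debugfs.c conversion above follows the current debugfs convention: creation return values are neither checked nor stored (debugfs files are expected to fail safely), and debugfs_remove() on a directory tears down its children recursively (debugfs_remove_recursive() is an alias for it), so only the directory dentries are kept. A minimal sketch of the resulting pattern, with hypothetical names:

#include <linux/debugfs.h>

static struct dentry *foo_debugfs_root;	/* hypothetical driver root */

static void foo_debugfs_init(void *drvdata, const struct file_operations *fops)
{
	foo_debugfs_root = debugfs_create_dir("foo", NULL);
	/* no IS_ERR() checks needed on the files themselves */
	debugfs_create_file("stats", 0444, foo_debugfs_root, drvdata, fops);
	debugfs_create_file("ctrl", 0644, foo_debugfs_root, drvdata, fops);
}

static void foo_debugfs_exit(void)
{
	debugfs_remove(foo_debugfs_root);	/* also removes stats and ctrl */
	foo_debugfs_root = NULL;
}

The patch still checks IS_ERR() on the directories because their dentries are reused as parents for later files.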
diff --git a/drivers/scsi/lpfc/lpfc_debugfs.h b/drivers/scsi/lpfc/lpfc_debugfs.h index f319f3af0400..a1464f8ac331 100644 --- a/drivers/scsi/lpfc/lpfc_debugfs.h +++ b/drivers/scsi/lpfc/lpfc_debugfs.h @@ -1,7 +1,7 @@ /******************************************************************* * This file is part of the Emulex Linux Device Driver for * * Fibre Channel Host Bus Adapters. * - * Copyright (C) 2017-2022 Broadcom. All Rights Reserved. The term * + * Copyright (C) 2017-2025 Broadcom. All Rights Reserved. The term * * “Broadcom” refers to Broadcom Inc. and/or its subsidiaries. * * Copyright (C) 2007-2011 Emulex. All rights reserved. * * EMULEX and SLI are trademarks of Emulex. * @@ -44,6 +44,9 @@ /* hbqinfo output buffer size */ #define LPFC_HBQINFO_SIZE 8192 +/* hdwqinfo output buffer size */ +#define LPFC_HDWQINFO_SIZE 8192 + /* nvmestat output buffer size */ #define LPFC_NVMESTAT_SIZE 8192 #define LPFC_IOKTIME_SIZE 8192 diff --git a/drivers/scsi/lpfc/lpfc_els.c b/drivers/scsi/lpfc/lpfc_els.c index fca81e0c7c2e..b71db7d7d747 100644 --- a/drivers/scsi/lpfc/lpfc_els.c +++ b/drivers/scsi/lpfc/lpfc_els.c @@ -3762,7 +3762,7 @@ lpfc_issue_els_rdf(struct lpfc_vport *vport, uint8_t retry) memset(prdf, 0, cmdsize); prdf->rdf.fpin_cmd = ELS_RDF; prdf->rdf.desc_len = cpu_to_be32(sizeof(struct lpfc_els_rdf_req) - - sizeof(struct fc_els_rdf)); + sizeof(struct fc_els_rdf_hdr)); prdf->reg_d1.reg_desc.desc_tag = cpu_to_be32(ELS_DTAG_FPIN_REGISTER); prdf->reg_d1.reg_desc.desc_len = cpu_to_be32( FC_TLV_DESC_LENGTH_FROM_SZ(prdf->reg_d1)); @@ -5339,12 +5339,12 @@ lpfc_cmpl_els_rsp(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb, ulp_status, ulp_word4, did); /* ELS response tag <ulpIoTag> completes */ lpfc_printf_vlog(vport, KERN_INFO, LOG_ELS, - "0110 ELS response tag x%x completes " + "0110 ELS response tag x%x completes fc_flag x%lx " "Data: x%x x%x x%x x%x x%lx x%x x%x x%x %p %p\n", - iotag, ulp_status, ulp_word4, tmo, + iotag, vport->fc_flag, ulp_status, ulp_word4, tmo, ndlp->nlp_DID, ndlp->nlp_flag, ndlp->nlp_state, ndlp->nlp_rpi, kref_read(&ndlp->kref), mbox, ndlp); - if (mbox) { + if (mbox && !test_bit(FC_PT2PT, &vport->fc_flag)) { if (ulp_status == 0 && test_bit(NLP_ACC_REGLOGIN, &ndlp->nlp_flag)) { if (!lpfc_unreg_rpi(vport, ndlp) && @@ -5403,6 +5403,10 @@ lpfc_cmpl_els_rsp(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb, } out_free_mbox: lpfc_mbox_rsrc_cleanup(phba, mbox, MBOX_THD_UNLOCKED); + } else if (mbox && test_bit(FC_PT2PT, &vport->fc_flag) && + test_bit(NLP_ACC_REGLOGIN, &ndlp->nlp_flag)) { + lpfc_mbx_cmpl_reg_login(phba, mbox); + clear_bit(NLP_ACC_REGLOGIN, &ndlp->nlp_flag); } out: if (ndlp && shost) { @@ -11259,6 +11263,11 @@ lpfc_cmpl_els_fdisc(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb, lpfc_vlog_msg(vport, KERN_WARNING, LOG_ELS, "0126 FDISC cmpl status: x%x/x%x)\n", ulp_status, ulp_word4); + + /* drop initial reference */ + if (!test_and_set_bit(NLP_DROPPED, &ndlp->nlp_flag)) + lpfc_nlp_put(ndlp); + goto fdisc_failed; } @@ -12008,7 +12017,11 @@ lpfc_sli4_els_xri_aborted(struct lpfc_hba *phba, sglq_entry->state = SGL_FREED; spin_unlock_irqrestore(&phba->sli4_hba.sgl_list_lock, iflag); - + lpfc_printf_log(phba, KERN_INFO, LOG_ELS | LOG_SLI | + LOG_DISCOVERY | LOG_NODE, + "0732 ELS XRI ABORT on Node: ndlp=x%px " + "xri=x%x\n", + ndlp, xri); if (ndlp) { lpfc_set_rrq_active(phba, ndlp, sglq_entry->sli4_lxritag, diff --git a/drivers/scsi/lpfc/lpfc_hw.h b/drivers/scsi/lpfc/lpfc_hw.h index 32298285ea5e..3bc0efa7453e 100644 --- a/drivers/scsi/lpfc/lpfc_hw.h +++ b/drivers/scsi/lpfc/lpfc_hw.h @@ -1,7 +1,7 @@ /******************************************************************* * This file is part of the Emulex Linux Device Driver for * * Fibre Channel Host Bus Adapters. * - * Copyright (C) 2017-2024 Broadcom. All Rights Reserved. The term * + * Copyright (C) 2017-2025 Broadcom. 
All Rights Reserved. The term * * “Broadcom” refers to Broadcom Inc. and/or its subsidiaries. * * Copyright (C) 2004-2016 Emulex. All rights reserved. * * EMULEX and SLI are trademarks of Emulex. * @@ -366,6 +366,7 @@ struct lpfc_name { } s; uint8_t wwn[8]; uint64_t name __packed __aligned(4); + __be64 wwn_be __packed __aligned(4); } u; }; diff --git a/drivers/scsi/lpfc/lpfc_hw4.h b/drivers/scsi/lpfc/lpfc_hw4.h index bc709786e6af..a7f7ed86d2b0 100644 --- a/drivers/scsi/lpfc/lpfc_hw4.h +++ b/drivers/scsi/lpfc/lpfc_hw4.h @@ -4909,18 +4909,18 @@ struct send_frame_wqe { #define ELS_RDF_REG_TAG_CNT 4 struct lpfc_els_rdf_reg_desc { - struct fc_df_desc_fpin_reg reg_desc; /* descriptor header */ + struct fc_df_desc_fpin_reg_hdr reg_desc; /* descriptor header */ __be32 desc_tags[ELS_RDF_REG_TAG_CNT]; /* tags in reg_desc */ }; struct lpfc_els_rdf_req { - struct fc_els_rdf rdf; /* hdr up to descriptors */ + struct fc_els_rdf_hdr rdf; /* hdr up to descriptors */ struct lpfc_els_rdf_reg_desc reg_d1; /* 1st descriptor */ }; struct lpfc_els_rdf_rsp { - struct fc_els_rdf_resp rdf_resp; /* hdr up to descriptors */ + struct fc_els_rdf_resp_hdr rdf_resp; /* hdr up to descriptors */ struct lpfc_els_rdf_reg_desc reg_d1; /* 1st descriptor */ }; diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c index 4081d2a358ee..0ca7429d86b8 100644 --- a/drivers/scsi/lpfc/lpfc_init.c +++ b/drivers/scsi/lpfc/lpfc_init.c @@ -3057,13 +3057,6 @@ lpfc_cleanup(struct lpfc_vport *vport) lpfc_vmid_vport_cleanup(vport); list_for_each_entry_safe(ndlp, next_ndlp, &vport->fc_nodes, nlp_listp) { - if (vport->port_type != LPFC_PHYSICAL_PORT && - ndlp->nlp_DID == Fabric_DID) { - /* Just free up ndlp with Fabric_DID for vports */ - lpfc_nlp_put(ndlp); - continue; - } - if (ndlp->nlp_DID == Fabric_Cntl_DID && ndlp->nlp_state == NLP_STE_UNUSED_NODE) { lpfc_nlp_put(ndlp); @@ -8300,10 +8293,7 @@ lpfc_sli4_driver_resource_setup(struct lpfc_hba *phba) phba->cfg_total_seg_cnt, phba->cfg_scsi_seg_cnt, phba->cfg_nvme_seg_cnt); - if (phba->cfg_sg_dma_buf_size < SLI4_PAGE_SIZE) - i = phba->cfg_sg_dma_buf_size; - else - i = SLI4_PAGE_SIZE; + i = min(phba->cfg_sg_dma_buf_size, SLI4_PAGE_SIZE); phba->lpfc_sg_dma_buf_pool = dma_pool_create("lpfc_sg_dma_buf_pool", diff --git a/drivers/scsi/lpfc/lpfc_nportdisc.c b/drivers/scsi/lpfc/lpfc_nportdisc.c index a596b80d03d4..1e5ef93e67e3 100644 --- a/drivers/scsi/lpfc/lpfc_nportdisc.c +++ b/drivers/scsi/lpfc/lpfc_nportdisc.c @@ -1,7 +1,7 @@ /******************************************************************* * This file is part of the Emulex Linux Device Driver for * * Fibre Channel Host Bus Adapters. * - * Copyright (C) 2017-2024 Broadcom. All Rights Reserved. The term * + * Copyright (C) 2017-2025 Broadcom. All Rights Reserved. The term * * “Broadcom” refers to Broadcom Inc. and/or its subsidiaries. * * Copyright (C) 2004-2016 Emulex. All rights reserved. * * EMULEX and SLI are trademarks of Emulex. * @@ -326,8 +326,14 @@ lpfc_defer_plogi_acc(struct lpfc_hba *phba, LPFC_MBOXQ_t *login_mbox) /* Now that REG_RPI completed successfully, * we can now proceed with sending the PLOGI ACC. 
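
For illustration only — a minimal sketch of how the new __be64 wwn_be view of the lpfc_name union above might be used; the helper name is hypothetical, not taken from the patch:

static inline u64 lpfc_wwn_to_host(const struct lpfc_name *wwn)
{
	/* wwn_be overlays the same eight bytes as u.name/u.wwn[]; the
	 * __be64 type lets sparse catch missing byte-order conversions.
	 */
	return be64_to_cpu(wwn->u.wwn_be);
}
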
*/ - rc = lpfc_els_rsp_acc(login_mbox->vport, ELS_CMD_PLOGI, - save_iocb, ndlp, NULL); + if (test_bit(FC_PT2PT, &ndlp->vport->fc_flag)) { + rc = lpfc_els_rsp_acc(login_mbox->vport, ELS_CMD_PLOGI, + save_iocb, ndlp, login_mbox); + } else { + rc = lpfc_els_rsp_acc(login_mbox->vport, ELS_CMD_PLOGI, + save_iocb, ndlp, NULL); + } + if (rc) { lpfc_printf_log(phba, KERN_ERR, LOG_TRACE_EVENT, "4576 PLOGI ACC fails pt2pt discovery: " @@ -335,9 +341,16 @@ lpfc_defer_plogi_acc(struct lpfc_hba *phba, LPFC_MBOXQ_t *login_mbox) } } - /* Now process the REG_RPI cmpl */ - lpfc_mbx_cmpl_reg_login(phba, login_mbox); - clear_bit(NLP_ACC_REGLOGIN, &ndlp->nlp_flag); + /* If this is a fabric topology, complete the reg_rpi and prli now. + * For Pt2Pt, the reg_rpi and PRLI are deferred until after the LS_ACC + * completes. This ensures, in Pt2Pt, that the PLOGI LS_ACC is sent + * before the PRLI. + */ + if (!test_bit(FC_PT2PT, &ndlp->vport->fc_flag)) { + /* Now process the REG_RPI cmpl */ + lpfc_mbx_cmpl_reg_login(phba, login_mbox); + clear_bit(NLP_ACC_REGLOGIN, &ndlp->nlp_flag); + } kfree(save_iocb); } diff --git a/drivers/scsi/lpfc/lpfc_nvme.c b/drivers/scsi/lpfc/lpfc_nvme.c index a6647dd360d1..e6f632521cff 100644 --- a/drivers/scsi/lpfc/lpfc_nvme.c +++ b/drivers/scsi/lpfc/lpfc_nvme.c @@ -1234,12 +1234,8 @@ lpfc_nvme_prep_io_cmd(struct lpfc_vport *vport, if ((phba->cfg_nvme_enable_fb) && test_bit(NLP_FIRSTBURST, &pnode->nlp_flag)) { req_len = lpfc_ncmd->nvmeCmd->payload_length; - if (req_len < pnode->nvme_fb_size) - wqe->fcp_iwrite.initial_xfer_len = - req_len; - else - wqe->fcp_iwrite.initial_xfer_len = - pnode->nvme_fb_size; + wqe->fcp_iwrite.initial_xfer_len = min(req_len, + pnode->nvme_fb_size); } else { wqe->fcp_iwrite.initial_xfer_len = 0; } diff --git a/drivers/scsi/lpfc/lpfc_scsi.c b/drivers/scsi/lpfc/lpfc_scsi.c index 508ceeecf2d9..6d9d8c196936 100644 --- a/drivers/scsi/lpfc/lpfc_scsi.c +++ b/drivers/scsi/lpfc/lpfc_scsi.c @@ -5935,7 +5935,7 @@ lpfc_chk_tgt_mapped(struct lpfc_vport *vport, struct fc_rport *rport) /** * lpfc_reset_flush_io_context - * @vport: The virtual port (scsi_host) for the flush context - * @tgt_id: If aborting by Target contect - specifies the target id + * @tgt_id: If aborting by Target context - specifies the target id * @lun_id: If aborting by Lun context - specifies the lun id * @context: specifies the context level to flush at. 
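
The pt2pt branching above is easier to see as a condensed sketch (control flow taken from the hunks; the completion hook-up via lpfc_cmpl_els_rsp() is as described in the patch comment):

if (test_bit(FC_PT2PT, &vport->fc_flag)) {
	/* pt2pt: attach login_mbox to the ACC; REG_RPI completion and
	 * the PRLI run from lpfc_cmpl_els_rsp() after the LS_ACC goes
	 * out on the wire.
	 */
	lpfc_els_rsp_acc(vport, ELS_CMD_PLOGI, save_iocb, ndlp, login_mbox);
} else {
	/* fabric: send the ACC and complete REG_RPI immediately */
	lpfc_els_rsp_acc(vport, ELS_CMD_PLOGI, save_iocb, ndlp, NULL);
	lpfc_mbx_cmpl_reg_login(phba, login_mbox);
	clear_bit(NLP_ACC_REGLOGIN, &ndlp->nlp_flag);
}
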
* @@ -6109,8 +6109,14 @@ lpfc_target_reset_handler(struct scsi_cmnd *cmnd) pnode->nlp_fcp_info &= ~NLP_FCP_2_DEVICE; spin_unlock_irqrestore(&pnode->lock, flags); } - lpfc_reset_flush_io_context(vport, tgt_id, lun_id, - LPFC_CTX_TGT); + status = lpfc_reset_flush_io_context(vport, tgt_id, lun_id, + LPFC_CTX_TGT); + if (status != SUCCESS) { + lpfc_printf_vlog(vport, KERN_ERR, LOG_FCP, + "0726 Target Reset flush status x%x\n", + status); + return status; + } return FAST_IO_FAIL; } @@ -6202,7 +6208,7 @@ lpfc_host_reset_handler(struct scsi_cmnd *cmnd) int rc, ret = SUCCESS; lpfc_printf_vlog(vport, KERN_ERR, LOG_FCP, - "3172 SCSI layer issued Host Reset Data:\n"); + "3172 SCSI layer issued Host Reset\n"); lpfc_offline_prep(phba, LPFC_MBX_WAIT); lpfc_offline(phba); diff --git a/drivers/scsi/lpfc/lpfc_sli.c b/drivers/scsi/lpfc/lpfc_sli.c index a8fbdf7119d8..7ea7c4245c69 100644 --- a/drivers/scsi/lpfc/lpfc_sli.c +++ b/drivers/scsi/lpfc/lpfc_sli.c @@ -8820,7 +8820,7 @@ lpfc_sli4_hba_setup(struct lpfc_hba *phba) if (unlikely(rc)) { lpfc_printf_log(phba, KERN_ERR, LOG_TRACE_EVENT, "0381 Error %d during queue setup.\n", rc); - goto out_stop_timers; + goto out_destroy_queue; } /* Initialize the driver internal SLI layer lists. */ lpfc_sli4_setup(phba); @@ -9103,7 +9103,6 @@ out_free_iocblist: lpfc_free_iocb_list(phba); out_destroy_queue: lpfc_sli4_queue_destroy(phba); -out_stop_timers: lpfc_stop_hba_timers(phba); out_free_mbox: mempool_free(mboxq, phba->mbox_mem_pool); @@ -12439,19 +12438,11 @@ lpfc_sli_issue_abort_iotag(struct lpfc_hba *phba, struct lpfc_sli_ring *pring, } /* - * If we're unloading, don't abort iocb on the ELS ring, but change - * the callback so that nothing happens when it finishes. + * Always abort the outstanding WQE and set the IA bit correctly + * for the context. This is necessary for correctly removing + * outstanding ndlp reference counts when the CQE completes with + * the XB bit set. */ - if (test_bit(FC_UNLOADING, &vport->load_flag) && - pring->ringno == LPFC_ELS_RING) { - if (cmdiocb->cmd_flag & LPFC_IO_FABRIC) - cmdiocb->fabric_cmd_cmpl = lpfc_ignore_els_cmpl; - else - cmdiocb->cmd_cmpl = lpfc_ignore_els_cmpl; - return retval; - } - - /* issue ABTS for this IOCB based on iotag */ abtsiocbp = __lpfc_sli_get_iocbq(phba); if (abtsiocbp == NULL) return IOCB_NORESOURCE; @@ -21373,7 +21364,7 @@ lpfc_sli4_issue_wqe(struct lpfc_hba *phba, struct lpfc_sli4_hdw_queue *qp, struct lpfc_sglq *sglq; struct lpfc_sli_ring *pring; unsigned long iflags; - uint32_t ret = 0; + int ret = 0; /* NVME_LS and NVME_LS ABTS requests. */ if (pwqe->cmd_flag & LPFC_IO_NVME_LS) { diff --git a/drivers/scsi/lpfc/lpfc_version.h b/drivers/scsi/lpfc/lpfc_version.h index 9ee3a3a4ec4d..31c3c5abdca6 100644 --- a/drivers/scsi/lpfc/lpfc_version.h +++ b/drivers/scsi/lpfc/lpfc_version.h @@ -20,7 +20,7 @@ * included with this package. 
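
The goto relabelling in the lpfc_sli4_hba_setup() hunk above follows the usual unwind rule: a failure jumps to the label that tears down everything already set up, in reverse order. A generic sketch with hypothetical names:

static int setup(void)
{
	int rc;

	rc = init_a();
	if (rc)
		return rc;
	rc = init_b();
	if (rc)
		goto undo_a;
	rc = init_c();
	if (rc)
		goto undo_b;	/* must unwind b as well, not just a */
	return 0;

undo_b:
	teardown_b();
undo_a:
	teardown_a();
	return rc;
}
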
* *******************************************************************/ -#define LPFC_DRIVER_VERSION "14.4.0.10" +#define LPFC_DRIVER_VERSION "14.4.0.11" #define LPFC_DRIVER_NAME "lpfc" /* Used for SLI 2/3 */ diff --git a/drivers/scsi/mpi3mr/mpi/mpi30_cnfg.h b/drivers/scsi/mpi3mr/mpi/mpi30_cnfg.h index 96401eb7e231..8c8bfbbdd34e 100644 --- a/drivers/scsi/mpi3mr/mpi/mpi30_cnfg.h +++ b/drivers/scsi/mpi3mr/mpi/mpi30_cnfg.h @@ -322,6 +322,9 @@ struct mpi3_man6_gpio_entry { #define MPI3_MAN6_GPIO_EXTINT_PARAM1_FLAGS_TRIGGER_MASK (0x01) #define MPI3_MAN6_GPIO_EXTINT_PARAM1_FLAGS_TRIGGER_EDGE (0x00) #define MPI3_MAN6_GPIO_EXTINT_PARAM1_FLAGS_TRIGGER_LEVEL (0x01) +#define MPI3_MAN6_GPIO_OVER_TEMP_PARAM1_LEVEL_WARNING (0x00) +#define MPI3_MAN6_GPIO_OVER_TEMP_PARAM1_LEVEL_CRITICAL (0x01) +#define MPI3_MAN6_GPIO_OVER_TEMP_PARAM1_LEVEL_FATAL (0x02) #define MPI3_MAN6_GPIO_PORT_GREEN_PARAM1_PHY_STATUS_ALL_UP (0x00) #define MPI3_MAN6_GPIO_PORT_GREEN_PARAM1_PHY_STATUS_ONE_OR_MORE_UP (0x01) #define MPI3_MAN6_GPIO_CABLE_MGMT_PARAM1_INTERFACE_MODULE_PRESENT (0x00) @@ -1250,6 +1253,37 @@ struct mpi3_io_unit_page17 { __le32 current_key[]; }; #define MPI3_IOUNIT17_PAGEVERSION (0x00) +struct mpi3_io_unit_page18 { + struct mpi3_config_page_header header; + u8 flags; + u8 poll_interval; + __le16 reserved0a; + __le32 reserved0c; +}; + +#define MPI3_IOUNIT18_PAGEVERSION (0x00) +#define MPI3_IOUNIT18_FLAGS_DIRECTATTACHED_ENABLE (0x01) +#define MPI3_IOUNIT18_POLLINTERVAL_DISABLE (0x00) +#ifndef MPI3_IOUNIT19_DEVICE_MAX +#define MPI3_IOUNIT19_DEVICE_MAX (1) +#endif +struct mpi3_iounit19_device { + __le16 temperature; + __le16 dev_handle; + __le16 persistent_id; + __le16 reserved06; +}; + +#define MPI3_IOUNIT19_DEVICE_TEMPERATURE_UNAVAILABLE (0x8000) +struct mpi3_io_unit_page19 { + struct mpi3_config_page_header header; + __le16 num_devices; + __le16 reserved0a; + __le32 reserved0c; + struct mpi3_iounit19_device device[MPI3_IOUNIT19_DEVICE_MAX]; +}; + +#define MPI3_IOUNIT19_PAGEVERSION (0x00) struct mpi3_ioc_page0 { struct mpi3_config_page_header header; __le32 reserved08; @@ -2356,7 +2390,9 @@ struct mpi3_device0_vd_format { __le16 io_throttle_group; __le16 io_throttle_group_low; __le16 io_throttle_group_high; - __le32 reserved0c; + u8 vd_abort_to; + u8 vd_reset_to; + __le16 reserved0e; }; #define MPI3_DEVICE0_VD_STATE_OFFLINE (0x00) #define MPI3_DEVICE0_VD_STATE_PARTIALLY_DEGRADED (0x01) diff --git a/drivers/scsi/mpi3mr/mpi/mpi30_pci.h b/drivers/scsi/mpi3mr/mpi/mpi30_pci.h index 7c15e5851ce4..4eeb11c3c73e 100644 --- a/drivers/scsi/mpi3mr/mpi/mpi30_pci.h +++ b/drivers/scsi/mpi3mr/mpi/mpi30_pci.h @@ -9,9 +9,11 @@ #define MPI3_NVME_ENCAP_CMD_MAX (1) #endif #define MPI3_NVME_FLAGS_FORCE_ADMIN_ERR_REPLY_MASK (0x0002) +#define MPI3_NVME_FLAGS_FORCE_ADMIN_ERR_REPLY_SHIFT (1) #define MPI3_NVME_FLAGS_FORCE_ADMIN_ERR_REPLY_FAIL_ONLY (0x0000) #define MPI3_NVME_FLAGS_FORCE_ADMIN_ERR_REPLY_ALL (0x0002) #define MPI3_NVME_FLAGS_SUBMISSIONQ_MASK (0x0001) +#define MPI3_NVME_FLAGS_SUBMISSIONQ_SHIFT (0) #define MPI3_NVME_FLAGS_SUBMISSIONQ_IO (0x0000) #define MPI3_NVME_FLAGS_SUBMISSIONQ_ADMIN (0x0001) diff --git a/drivers/scsi/mpi3mr/mpi/mpi30_sas.h b/drivers/scsi/mpi3mr/mpi/mpi30_sas.h index 4a93c67d335f..190b06508b00 100644 --- a/drivers/scsi/mpi3mr/mpi/mpi30_sas.h +++ b/drivers/scsi/mpi3mr/mpi/mpi30_sas.h @@ -11,6 +11,7 @@ #define MPI3_SAS_DEVICE_INFO_STP_INITIATOR (0x00000010) #define MPI3_SAS_DEVICE_INFO_SMP_INITIATOR (0x00000008) #define MPI3_SAS_DEVICE_INFO_DEVICE_TYPE_MASK (0x00000007) +#define 
MPI3_SAS_DEVICE_INFO_DEVICE_TYPE_SHIFT (0) #define MPI3_SAS_DEVICE_INFO_DEVICE_TYPE_NO_DEVICE (0x00000000) #define MPI3_SAS_DEVICE_INFO_DEVICE_TYPE_END_DEVICE (0x00000001) #define MPI3_SAS_DEVICE_INFO_DEVICE_TYPE_EXPANDER (0x00000002) diff --git a/drivers/scsi/mpi3mr/mpi/mpi30_transport.h b/drivers/scsi/mpi3mr/mpi/mpi30_transport.h index 5c522e2531c3..28ab2efb3baa 100644 --- a/drivers/scsi/mpi3mr/mpi/mpi30_transport.h +++ b/drivers/scsi/mpi3mr/mpi/mpi30_transport.h @@ -18,7 +18,7 @@ union mpi3_version_union { #define MPI3_VERSION_MAJOR (3) #define MPI3_VERSION_MINOR (0) -#define MPI3_VERSION_UNIT (35) +#define MPI3_VERSION_UNIT (37) #define MPI3_VERSION_DEV (0) #define MPI3_DEVHANDLE_INVALID (0xffff) struct mpi3_sysif_oper_queue_indexes { diff --git a/drivers/scsi/mpi3mr/mpi3mr.h b/drivers/scsi/mpi3mr/mpi3mr.h index 8d4ef49e04d1..6742684e2990 100644 --- a/drivers/scsi/mpi3mr/mpi3mr.h +++ b/drivers/scsi/mpi3mr/mpi3mr.h @@ -56,8 +56,8 @@ extern struct list_head mrioc_list; extern int prot_mask; extern atomic64_t event_counter; -#define MPI3MR_DRIVER_VERSION "8.14.0.5.50" -#define MPI3MR_DRIVER_RELDATE "27-June-2025" +#define MPI3MR_DRIVER_VERSION "8.15.0.5.50" +#define MPI3MR_DRIVER_RELDATE "12-August-2025" #define MPI3MR_DRIVER_NAME "mpi3mr" #define MPI3MR_DRIVER_LICENSE "GPL" @@ -697,6 +697,8 @@ struct tgt_dev_vd { u16 tg_id; u32 tg_high; u32 tg_low; + u8 abort_to; + u8 reset_to; struct mpi3mr_throttle_group_info *tg; }; @@ -738,6 +740,8 @@ enum mpi3mr_dev_state { * @wwid: World wide ID * @enclosure_logical_id: Enclosure logical identifier * @dev_spec: Device type specific information + * @abort_to: Timeout for abort TM + * @reset_to: Timeout for Target/LUN reset TM * @ref_count: Reference count * @state: device state */ diff --git a/drivers/scsi/mpi3mr/mpi3mr_fw.c b/drivers/scsi/mpi3mr/mpi3mr_fw.c index 0152d31d430a..8fe6e0bf342e 100644 --- a/drivers/scsi/mpi3mr/mpi3mr_fw.c +++ b/drivers/scsi/mpi3mr/mpi3mr_fw.c @@ -2353,6 +2353,8 @@ static int mpi3mr_create_op_queues(struct mpi3mr_ioc *mrioc) { int retval = 0; u16 num_queues = 0, i = 0, msix_count_op_q = 1; + u32 ioc_status; + enum mpi3mr_iocstate ioc_state; num_queues = min_t(int, mrioc->facts.max_op_reply_q, mrioc->facts.max_op_req_q); @@ -2408,6 +2410,14 @@ static int mpi3mr_create_op_queues(struct mpi3mr_ioc *mrioc) retval = -1; goto out_failed; } + ioc_status = readl(&mrioc->sysif_regs->ioc_status); + ioc_state = mpi3mr_get_iocstate(mrioc); + if ((ioc_status & MPI3_SYSIF_IOC_STATUS_RESET_HISTORY) || + ioc_state != MRIOC_STATE_READY) { + mpi3mr_print_fault_info(mrioc); + retval = -1; + goto out_failed; + } mrioc->num_op_reply_q = mrioc->num_op_req_q = i; ioc_info(mrioc, "successfully created %d operational queue pairs(default/polled) queue = (%d/%d)\n", @@ -5420,6 +5430,7 @@ int mpi3mr_soft_reset_handler(struct mpi3mr_ioc *mrioc, mpi3mr_reset_rc_name(reset_reason)); mrioc->device_refresh_on = 0; + scsi_block_requests(mrioc->shost); mrioc->reset_in_progress = 1; mrioc->stop_bsgs = 1; mrioc->prev_reset_result = -1; @@ -5528,6 +5539,7 @@ out: if (!retval) { mrioc->diagsave_timeout = 0; mrioc->reset_in_progress = 0; + scsi_unblock_requests(mrioc->shost); mrioc->pel_abort_requested = 0; if (mrioc->pel_enabled) { mrioc->pel_cmds.retry_count = 0; @@ -5552,6 +5564,7 @@ out: mrioc->device_refresh_on = 0; mrioc->unrecoverable = 1; mrioc->reset_in_progress = 0; + scsi_unblock_requests(mrioc->shost); mrioc->stop_bsgs = 0; retval = -1; mpi3mr_flush_cmds_for_unrecovered_controller(mrioc); diff --git a/drivers/scsi/mpi3mr/mpi3mr_os.c 
b/drivers/scsi/mpi3mr/mpi3mr_os.c index 3df52a3b435b..b88633e1efe2 100644 --- a/drivers/scsi/mpi3mr/mpi3mr_os.c +++ b/drivers/scsi/mpi3mr/mpi3mr_os.c @@ -1308,6 +1308,12 @@ static void mpi3mr_update_tgtdev(struct mpi3mr_ioc *mrioc, if (vdinf->vd_state == MPI3_DEVICE0_VD_STATE_OFFLINE) tgtdev->is_hidden = 1; tgtdev->non_stl = 1; + tgtdev->dev_spec.vd_inf.reset_to = + max_t(u8, vdinf->vd_reset_to, + MPI3MR_INTADMCMD_TIMEOUT); + tgtdev->dev_spec.vd_inf.abort_to = + max_t(u8, vdinf->vd_abort_to, + MPI3MR_INTADMCMD_TIMEOUT); tgtdev->dev_spec.vd_inf.tg_id = vdinf_io_throttle_group; tgtdev->dev_spec.vd_inf.tg_high = le16_to_cpu(vdinf->io_throttle_group_high) * 2048; @@ -2049,8 +2055,8 @@ static void mpi3mr_fwevt_bh(struct mpi3mr_ioc *mrioc, if (!fwevt->process_evt) goto evt_ack; - dprint_event_bh(mrioc, "processing event(0x%02x) in the bottom half handler\n", - fwevt->event_id); + dprint_event_bh(mrioc, "processing event(0x%02x) -(0x%08x) in the bottom half handler\n", + fwevt->event_id, fwevt->evt_ctx); switch (fwevt->event_id) { case MPI3_EVENT_DEVICE_ADDED: @@ -2866,12 +2872,14 @@ static void mpi3mr_preparereset_evt_th(struct mpi3mr_ioc *mrioc, "prepare for reset event top half with rc=start\n"); if (mrioc->prepare_for_reset) return; + scsi_block_requests(mrioc->shost); mrioc->prepare_for_reset = 1; mrioc->prepare_for_reset_timeout_counter = 0; } else if (evtdata->reason_code == MPI3_EVENT_PREPARE_RESET_RC_ABORT) { dprint_event_th(mrioc, "prepare for reset top half with rc=abort\n"); mrioc->prepare_for_reset = 0; + scsi_unblock_requests(mrioc->shost); mrioc->prepare_for_reset_timeout_counter = 0; } if ((event_reply->msg_flags & MPI3_EVENT_NOTIFY_MSGFLAGS_ACK_MASK) @@ -3076,8 +3084,8 @@ void mpi3mr_os_handle_events(struct mpi3mr_ioc *mrioc, } if (process_evt_bh || ack_req) { dprint_event_th(mrioc, - "scheduling bottom half handler for event(0x%02x),ack_required=%d\n", - evt_type, ack_req); + "scheduling bottom half handler for event(0x%02x) - (0x%08x), ack_required=%d\n", + evt_type, le32_to_cpu(event_reply->event_context), ack_req); sz = event_reply->event_data_length * 4; fwevt = mpi3mr_alloc_fwevt(sz); if (!fwevt) { @@ -3915,11 +3923,13 @@ int mpi3mr_issue_tm(struct mpi3mr_ioc *mrioc, u8 tm_type, if (scsi_tgt_priv_data) atomic_inc(&scsi_tgt_priv_data->block_io); - if (tgtdev && (tgtdev->dev_type == MPI3_DEVICE_DEVFORM_PCIE)) { - if (cmd_priv && tgtdev->dev_spec.pcie_inf.abort_to) - timeout = tgtdev->dev_spec.pcie_inf.abort_to; - else if (!cmd_priv && tgtdev->dev_spec.pcie_inf.reset_to) - timeout = tgtdev->dev_spec.pcie_inf.reset_to; + if (tgtdev) { + if (tgtdev->dev_type == MPI3_DEVICE_DEVFORM_PCIE) + timeout = cmd_priv ? tgtdev->dev_spec.pcie_inf.abort_to + : tgtdev->dev_spec.pcie_inf.reset_to; + else if (tgtdev->dev_type == MPI3_DEVICE_DEVFORM_VD) + timeout = cmd_priv ? 
tgtdev->dev_spec.vd_inf.abort_to + : tgtdev->dev_spec.vd_inf.reset_to; } init_completion(&drv_cmd->done); diff --git a/drivers/scsi/mpi3mr/mpi3mr_transport.c b/drivers/scsi/mpi3mr/mpi3mr_transport.c index c8d6ced5640e..d70f002d6487 100644 --- a/drivers/scsi/mpi3mr/mpi3mr_transport.c +++ b/drivers/scsi/mpi3mr/mpi3mr_transport.c @@ -413,9 +413,11 @@ static void mpi3mr_remove_device_by_sas_address(struct mpi3mr_ioc *mrioc, sas_address, hba_port); if (tgtdev) { if (!list_empty(&tgtdev->list)) { - list_del_init(&tgtdev->list); was_on_tgtdev_list = 1; - mpi3mr_tgtdev_put(tgtdev); + if (tgtdev->state == MPI3MR_DEV_REMOVE_HS_STARTED) { + list_del_init(&tgtdev->list); + mpi3mr_tgtdev_put(tgtdev); + } } } spin_unlock_irqrestore(&mrioc->tgtdev_lock, flags); @@ -2079,6 +2081,8 @@ int mpi3mr_expander_add(struct mpi3mr_ioc *mrioc, u16 handle) link_rate = (expander_pg1.negotiated_link_rate & MPI3_SAS_NEG_LINK_RATE_LOGICAL_MASK) >> MPI3_SAS_NEG_LINK_RATE_LOGICAL_SHIFT; + if (link_rate < MPI3_SAS_NEG_LINK_RATE_1_5) + link_rate = MPI3_SAS_NEG_LINK_RATE_1_5; mpi3mr_update_links(mrioc, sas_address_parent, handle, i, link_rate, hba_port); } @@ -2388,6 +2392,9 @@ int mpi3mr_report_tgtdev_to_sas_transport(struct mpi3mr_ioc *mrioc, link_rate = mpi3mr_get_sas_negotiated_logical_linkrate(mrioc, tgtdev); + if (link_rate < MPI3_SAS_NEG_LINK_RATE_1_5) + link_rate = MPI3_SAS_NEG_LINK_RATE_1_5; + mpi3mr_update_links(mrioc, sas_address_parent, tgtdev->dev_handle, parent_phy_number, link_rate, hba_port); diff --git a/drivers/scsi/mpt3sas/mpt3sas_base.c b/drivers/scsi/mpt3sas/mpt3sas_base.c index bd3efa5b46c7..0d652db8fe24 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_base.c +++ b/drivers/scsi/mpt3sas/mpt3sas_base.c @@ -1420,7 +1420,13 @@ _base_display_reply_info(struct MPT3SAS_ADAPTER *ioc, u16 smid, u8 msix_index, if (ioc_status & MPI2_IOCSTATUS_FLAG_LOG_INFO_AVAILABLE) { loginfo = le32_to_cpu(mpi_reply->IOCLogInfo); - _base_sas_log_info(ioc, loginfo); + if (ioc->logging_level & MPT_DEBUG_REPLY) + _base_sas_log_info(ioc, loginfo); + else { + if (!((ioc_status & MPI2_IOCSTATUS_MASK) & + MPI2_IOCSTATUS_CONFIG_INVALID_PAGE)) + _base_sas_log_info(ioc, loginfo); + } } if (ioc_status || loginfo) { diff --git a/drivers/scsi/mpt3sas/mpt3sas_base.h b/drivers/scsi/mpt3sas/mpt3sas_base.h index 939141cde3ca..e6a6f21d309b 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_base.h +++ b/drivers/scsi/mpt3sas/mpt3sas_base.h @@ -77,8 +77,8 @@ #define MPT3SAS_DRIVER_NAME "mpt3sas" #define MPT3SAS_AUTHOR "Avago Technologies <MPT-FusionLinux.pdl@avagotech.com>" #define MPT3SAS_DESCRIPTION "LSI MPT Fusion SAS 3.0 Device Driver" -#define MPT3SAS_DRIVER_VERSION "52.100.00.00" -#define MPT3SAS_MAJOR_VERSION 52 +#define MPT3SAS_DRIVER_VERSION "54.100.00.00" +#define MPT3SAS_MAJOR_VERSION 54 #define MPT3SAS_MINOR_VERSION 100 #define MPT3SAS_BUILD_VERSION 00 #define MPT3SAS_RELEASE_VERSION 00 diff --git a/drivers/scsi/mpt3sas/mpt3sas_transport.c b/drivers/scsi/mpt3sas/mpt3sas_transport.c index dc74ebc6405a..f3400d01cc2a 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_transport.c +++ b/drivers/scsi/mpt3sas/mpt3sas_transport.c @@ -166,6 +166,9 @@ _transport_convert_phy_link_rate(u8 link_rate) case MPI25_SAS_NEG_LINK_RATE_12_0: rc = SAS_LINK_RATE_12_0_GBPS; break; + case MPI26_SAS_NEG_LINK_RATE_22_5: + rc = SAS_LINK_RATE_22_5_GBPS; + break; case MPI2_SAS_NEG_LINK_RATE_PHY_DISABLED: rc = SAS_PHY_DISABLED; break; @@ -987,11 +990,9 @@ mpt3sas_transport_port_remove(struct MPT3SAS_ADAPTER *ioc, u64 sas_address, list_for_each_entry_safe(mpt3sas_phy, next_phy, 
&mpt3sas_port->phy_list, port_siblings) { if ((ioc->logging_level & MPT_DEBUG_TRANSPORT)) - dev_printk(KERN_INFO, &mpt3sas_port->port->dev, - "remove: sas_addr(0x%016llx), phy(%d)\n", - (unsigned long long) - mpt3sas_port->remote_identify.sas_address, - mpt3sas_phy->phy_id); + ioc_info(ioc, "remove: sas_addr(0x%016llx), phy(%d)\n", + (unsigned long long) mpt3sas_port->remote_identify.sas_address, + mpt3sas_phy->phy_id); mpt3sas_phy->phy_belongs_to_port = 0; if (!ioc->remove_host) sas_port_delete_phy(mpt3sas_port->port, diff --git a/drivers/scsi/mvsas/mv_sas.c b/drivers/scsi/mvsas/mv_sas.c index 15b3d9d55a4b..f2e7997d5b9d 100644 --- a/drivers/scsi/mvsas/mv_sas.c +++ b/drivers/scsi/mvsas/mv_sas.c @@ -1175,7 +1175,7 @@ static int mvs_dev_found_notify(struct domain_device *dev, int lock) mvi_device->dev_type = dev->dev_type; mvi_device->mvi_info = mvi; mvi_device->sas_device = dev; - if (parent_dev && dev_is_expander(parent_dev->dev_type)) { + if (dev_parent_is_expander(dev)) { int phy_id; phy_id = sas_find_attached_phy_id(&parent_dev->ex_dev, dev); diff --git a/drivers/scsi/myrs.c b/drivers/scsi/myrs.c index 95af3bb03834..a58abd796603 100644 --- a/drivers/scsi/myrs.c +++ b/drivers/scsi/myrs.c @@ -498,14 +498,14 @@ static bool myrs_enable_mmio_mbox(struct myrs_hba *cs, /* Temporary dma mapping, used only in the scope of this function */ mbox = dma_alloc_coherent(&pdev->dev, sizeof(union myrs_cmd_mbox), &mbox_addr, GFP_KERNEL); - if (dma_mapping_error(&pdev->dev, mbox_addr)) + if (!mbox) return false; /* These are the base addresses for the command memory mailbox array */ cs->cmd_mbox_size = MYRS_MAX_CMD_MBOX * sizeof(union myrs_cmd_mbox); cmd_mbox = dma_alloc_coherent(&pdev->dev, cs->cmd_mbox_size, &cs->cmd_mbox_addr, GFP_KERNEL); - if (dma_mapping_error(&pdev->dev, cs->cmd_mbox_addr)) { + if (!cmd_mbox) { dev_err(&pdev->dev, "Failed to map command mailbox\n"); goto out_free; } @@ -520,7 +520,7 @@ static bool myrs_enable_mmio_mbox(struct myrs_hba *cs, cs->stat_mbox_size = MYRS_MAX_STAT_MBOX * sizeof(struct myrs_stat_mbox); stat_mbox = dma_alloc_coherent(&pdev->dev, cs->stat_mbox_size, &cs->stat_mbox_addr, GFP_KERNEL); - if (dma_mapping_error(&pdev->dev, cs->stat_mbox_addr)) { + if (!stat_mbox) { dev_err(&pdev->dev, "Failed to map status mailbox\n"); goto out_free; } @@ -533,7 +533,7 @@ static bool myrs_enable_mmio_mbox(struct myrs_hba *cs, cs->fwstat_buf = dma_alloc_coherent(&pdev->dev, sizeof(struct myrs_fwstat), &cs->fwstat_addr, GFP_KERNEL); - if (dma_mapping_error(&pdev->dev, cs->fwstat_addr)) { + if (!cs->fwstat_buf) { dev_err(&pdev->dev, "Failed to map firmware health buffer\n"); cs->fwstat_buf = NULL; goto out_free; diff --git a/drivers/scsi/pm8001/pm8001_ctl.c b/drivers/scsi/pm8001/pm8001_ctl.c index 7618f9cc9986..cbfda8c04e95 100644 --- a/drivers/scsi/pm8001/pm8001_ctl.c +++ b/drivers/scsi/pm8001/pm8001_ctl.c @@ -534,23 +534,25 @@ static ssize_t pm8001_ctl_iop_log_show(struct device *cdev, char *str = buf; u32 read_size = pm8001_ha->main_cfg_tbl.pm80xx_tbl.event_log_size / 1024; - static u32 start, end, count; u32 max_read_times = 32; u32 max_count = (read_size * 1024) / (max_read_times * 4); u32 *temp = (u32 *)pm8001_ha->memoryMap.region[IOP].virt_ptr; - if ((count % max_count) == 0) { - start = 0; - end = max_read_times; - count = 0; + mutex_lock(&pm8001_ha->iop_log_lock); + + if ((pm8001_ha->iop_log_count % max_count) == 0) { + pm8001_ha->iop_log_start = 0; + pm8001_ha->iop_log_end = max_read_times; + pm8001_ha->iop_log_count = 0; } else { - start = end; - end = end + 
max_read_times; + pm8001_ha->iop_log_start = pm8001_ha->iop_log_end; + pm8001_ha->iop_log_end = pm8001_ha->iop_log_end + max_read_times; } - for (; start < end; start++) - str += sprintf(str, "%08x ", *(temp+start)); - count++; + for (; pm8001_ha->iop_log_start < pm8001_ha->iop_log_end; pm8001_ha->iop_log_start++) + str += sprintf(str, "%08x ", *(temp+pm8001_ha->iop_log_start)); + pm8001_ha->iop_log_count++; + mutex_unlock(&pm8001_ha->iop_log_lock); return str - buf; } static DEVICE_ATTR(iop_log, S_IRUGO, pm8001_ctl_iop_log_show, NULL); @@ -680,7 +682,7 @@ static int pm8001_set_nvmd(struct pm8001_hba_info *pm8001_ha) struct pm8001_ioctl_payload *payload; DECLARE_COMPLETION_ONSTACK(completion); u8 *ioctlbuffer; - u32 ret; + int ret; u32 length = 1024 * 5 + sizeof(*payload) - 1; if (pm8001_ha->fw_image->size > 4096) { diff --git a/drivers/scsi/pm8001/pm8001_hwi.c b/drivers/scsi/pm8001/pm8001_hwi.c index 42a4eeac24c9..8005995a317c 100644 --- a/drivers/scsi/pm8001/pm8001_hwi.c +++ b/drivers/scsi/pm8001/pm8001_hwi.c @@ -2163,8 +2163,7 @@ mpi_sata_completion(struct pm8001_hba_info *pm8001_ha, void *piomb) /* Print sas address of IO failed device */ if ((status != IO_SUCCESS) && (status != IO_OVERFLOW) && (status != IO_UNDERFLOW)) { - if (!((t->dev->parent) && - (dev_is_expander(t->dev->parent->dev_type)))) { + if (!dev_parent_is_expander(t->dev)) { for (i = 0, j = 4; j <= 7 && i <= 3; i++, j++) sata_addr_low[i] = pm8001_ha->sas_addr[j]; for (i = 0, j = 0; j <= 3 && i <= 3; i++, j++) @@ -4168,7 +4167,6 @@ static int pm8001_chip_reg_dev_req(struct pm8001_hba_info *pm8001_ha, u16 firstBurstSize = 0; u16 ITNT = 2000; struct domain_device *dev = pm8001_dev->sas_device; - struct domain_device *parent_dev = dev->parent; struct pm8001_port *port = dev->port->lldd_port; memset(&payload, 0, sizeof(payload)); @@ -4186,10 +4184,9 @@ static int pm8001_chip_reg_dev_req(struct pm8001_hba_info *pm8001_ha, dev_is_expander(pm8001_dev->dev_type)) stp_sspsmp_sata = 0x01; /*ssp or smp*/ } - if (parent_dev && dev_is_expander(parent_dev->dev_type)) - phy_id = parent_dev->ex_dev.ex_phy->phy_id; - else - phy_id = pm8001_dev->attached_phy; + + phy_id = pm80xx_get_local_phy_id(dev); + opc = OPC_INB_REG_DEV; linkrate = (pm8001_dev->sas_device->linkrate < dev->port->linkrate) ? pm8001_dev->sas_device->linkrate : dev->port->linkrate; diff --git a/drivers/scsi/pm8001/pm8001_hwi.h b/drivers/scsi/pm8001/pm8001_hwi.h index fc2127dcb58d..f1ce8df082b0 100644 --- a/drivers/scsi/pm8001/pm8001_hwi.h +++ b/drivers/scsi/pm8001/pm8001_hwi.h @@ -339,8 +339,10 @@ struct ssp_completion_resp { __le32 status; __le32 param; __le32 ssptag_rescv_rescpad; + + /* Must be last --ends in a flexible-array member. 
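
Because ssp_resp_iu now ends in a flexible-array member, no field may be declared after it; if the residual count is ever needed again, it has to be located from the actual IU length. A hypothetical helper, not part of the patch:

static inline u32 ssp_resp_residual_count(const struct ssp_completion_resp *resp,
					  size_t iu_len)
{
	__le32 residual;

	/* the residual count follows the variable-length response IU */
	memcpy(&residual, (const u8 *)&resp->ssp_resp_iu + iu_len,
	       sizeof(residual));
	return le32_to_cpu(residual);
}
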
*/ struct ssp_response_iu ssp_resp_iu; - __le32 residual_count; + /* __le32 residual_count; */ } __attribute__((packed, aligned(4))); diff --git a/drivers/scsi/pm8001/pm8001_init.c b/drivers/scsi/pm8001/pm8001_init.c index 599410bcdfea..8ff4b89ff81e 100644 --- a/drivers/scsi/pm8001/pm8001_init.c +++ b/drivers/scsi/pm8001/pm8001_init.c @@ -552,6 +552,7 @@ static struct pm8001_hba_info *pm8001_pci_alloc(struct pci_dev *pdev, pm8001_ha->id = pm8001_id++; pm8001_ha->logging_level = logging_level; pm8001_ha->non_fatal_count = 0; + mutex_init(&pm8001_ha->iop_log_lock); if (link_rate >= 1 && link_rate <= 15) pm8001_ha->link_rate = (link_rate << 8); else { diff --git a/drivers/scsi/pm8001/pm8001_sas.c b/drivers/scsi/pm8001/pm8001_sas.c index f7067878b34f..6a8d35aea93a 100644 --- a/drivers/scsi/pm8001/pm8001_sas.c +++ b/drivers/scsi/pm8001/pm8001_sas.c @@ -130,6 +130,16 @@ static void pm80xx_get_tag_opcodes(struct sas_task *task, int *ata_op, } } +u32 pm80xx_get_local_phy_id(struct domain_device *dev) +{ + struct pm8001_device *pm8001_dev = dev->lldd_dev; + + if (dev_parent_is_expander(dev)) + return dev->parent->ex_dev.ex_phy->phy_id; + + return pm8001_dev->attached_phy; +} + void pm80xx_show_pending_commands(struct pm8001_hba_info *pm8001_ha, struct pm8001_device *target_pm8001_dev) { @@ -477,7 +487,7 @@ int pm8001_queue_command(struct sas_task *task, gfp_t gfp_flags) struct pm8001_device *pm8001_dev = dev->lldd_dev; bool internal_abort = sas_is_internal_abort(task); struct pm8001_hba_info *pm8001_ha; - struct pm8001_port *port = NULL; + struct pm8001_port *port; struct pm8001_ccb_info *ccb; unsigned long flags; u32 n_elem = 0; @@ -502,8 +512,7 @@ int pm8001_queue_command(struct sas_task *task, gfp_t gfp_flags) spin_lock_irqsave(&pm8001_ha->lock, flags); - pm8001_dev = dev->lldd_dev; - port = pm8001_ha->phy[pm8001_dev->attached_phy].port; + port = dev->port->lldd_port; if (!internal_abort && (DEV_IS_GONE(pm8001_dev) || !port || !port->port_attached)) { @@ -701,7 +710,7 @@ static int pm8001_dev_found_notify(struct domain_device *dev) dev->lldd_dev = pm8001_device; pm8001_device->dev_type = dev->dev_type; pm8001_device->dcompletion = &completion; - if (parent_dev && dev_is_expander(parent_dev->dev_type)) { + if (dev_parent_is_expander(dev)) { int phy_id; phy_id = sas_find_attached_phy_id(&parent_dev->ex_dev, dev); @@ -766,7 +775,16 @@ static void pm8001_dev_gone_notify(struct domain_device *dev) spin_lock_irqsave(&pm8001_ha->lock, flags); } PM8001_CHIP_DISP->dereg_dev_req(pm8001_ha, device_id); - pm8001_ha->phy[pm8001_dev->attached_phy].phy_attached = 0; + + /* + * The phy array only contains local phys. Thus, we cannot clear + * phy_attached for a device behind an expander. 
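
Judging by the open-coded parent checks it replaces throughout these hunks, dev_parent_is_expander() presumably amounts to:

static inline bool dev_parent_is_expander(struct domain_device *dev)
{
	return dev->parent && dev_is_expander(dev->parent->dev_type);
}
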
+ */ + if (!dev_parent_is_expander(dev)) { + u32 phy_id = pm80xx_get_local_phy_id(dev); + + pm8001_ha->phy[phy_id].phy_attached = 0; + } pm8001_free_dev(pm8001_dev); } else { pm8001_dbg(pm8001_ha, DISC, "Found dev has gone.\n"); @@ -1048,7 +1066,7 @@ int pm8001_abort_task(struct sas_task *task) struct pm8001_hba_info *pm8001_ha; struct pm8001_device *pm8001_dev; int rc = TMF_RESP_FUNC_FAILED, ret; - u32 phy_id, port_id; + u32 port_id; struct sas_task_slow slow_task; if (!task->lldd_task || !task->dev) @@ -1057,7 +1075,6 @@ int pm8001_abort_task(struct sas_task *task) dev = task->dev; pm8001_dev = dev->lldd_dev; pm8001_ha = pm8001_find_ha_by_dev(dev); - phy_id = pm8001_dev->attached_phy; if (PM8001_CHIP_DISP->fatal_errors(pm8001_ha)) { // If the controller is seeing fatal errors @@ -1089,7 +1106,8 @@ int pm8001_abort_task(struct sas_task *task) if (pm8001_ha->chip_id == chip_8006) { DECLARE_COMPLETION_ONSTACK(completion_reset); DECLARE_COMPLETION_ONSTACK(completion); - struct pm8001_phy *phy = pm8001_ha->phy + phy_id; + u32 phy_id = pm80xx_get_local_phy_id(dev); + struct pm8001_phy *phy = &pm8001_ha->phy[phy_id]; port_id = phy->port->port_id; /* 1. Set Device state as Recovery */ diff --git a/drivers/scsi/pm8001/pm8001_sas.h b/drivers/scsi/pm8001/pm8001_sas.h index 334485bb2c12..b63b6ffcaaf5 100644 --- a/drivers/scsi/pm8001/pm8001_sas.h +++ b/drivers/scsi/pm8001/pm8001_sas.h @@ -547,6 +547,10 @@ struct pm8001_hba_info { u32 ci_offset; u32 pi_offset; u32 max_memcnt; + u32 iop_log_start; + u32 iop_log_end; + u32 iop_log_count; + struct mutex iop_log_lock; }; struct pm8001_work { @@ -798,6 +802,7 @@ void pm8001_setds_completion(struct domain_device *dev); void pm8001_tmf_aborted(struct sas_task *task); void pm80xx_show_pending_commands(struct pm8001_hba_info *pm8001_ha, struct pm8001_device *dev); +u32 pm80xx_get_local_phy_id(struct domain_device *dev); #endif diff --git a/drivers/scsi/pm8001/pm80xx_hwi.c b/drivers/scsi/pm8001/pm80xx_hwi.c index c1bae995a412..31960b72c1e9 100644 --- a/drivers/scsi/pm8001/pm80xx_hwi.c +++ b/drivers/scsi/pm8001/pm80xx_hwi.c @@ -2340,8 +2340,7 @@ mpi_sata_completion(struct pm8001_hba_info *pm8001_ha, /* Print sas address of IO failed device */ if ((status != IO_SUCCESS) && (status != IO_OVERFLOW) && (status != IO_UNDERFLOW)) { - if (!((t->dev->parent) && - (dev_is_expander(t->dev->parent->dev_type)))) { + if (!dev_parent_is_expander(t->dev)) { for (i = 0, j = 4; i <= 3 && j <= 7; i++, j++) sata_addr_low[i] = pm8001_ha->sas_addr[j]; for (i = 0, j = 0; i <= 3 && j <= 3; i++, j++) @@ -4780,7 +4779,6 @@ static int pm80xx_chip_reg_dev_req(struct pm8001_hba_info *pm8001_ha, u16 firstBurstSize = 0; u16 ITNT = 2000; struct domain_device *dev = pm8001_dev->sas_device; - struct domain_device *parent_dev = dev->parent; struct pm8001_port *port = dev->port->lldd_port; memset(&payload, 0, sizeof(payload)); @@ -4799,10 +4797,8 @@ static int pm80xx_chip_reg_dev_req(struct pm8001_hba_info *pm8001_ha, dev_is_expander(pm8001_dev->dev_type)) stp_sspsmp_sata = 0x01; /*ssp or smp*/ } - if (parent_dev && dev_is_expander(parent_dev->dev_type)) - phy_id = parent_dev->ex_dev.ex_phy->phy_id; - else - phy_id = pm8001_dev->attached_phy; + + phy_id = pm80xx_get_local_phy_id(dev); opc = OPC_INB_REG_DEV; diff --git a/drivers/scsi/pm8001/pm80xx_hwi.h b/drivers/scsi/pm8001/pm80xx_hwi.h index eb8fd37b2066..d8a63b7fed6a 100644 --- a/drivers/scsi/pm8001/pm80xx_hwi.h +++ b/drivers/scsi/pm8001/pm80xx_hwi.h @@ -558,8 +558,10 @@ struct ssp_completion_resp { __le32 status; __le32 param; __le32 
ssptag_rescv_rescpad; + + /* Must be last --ends in a flexible-array member. */ struct ssp_response_iu ssp_resp_iu; - __le32 residual_count; + /* __le32 residual_count; */ } __attribute__((packed, aligned(4))); #define SSP_RESCV_BIT 0x00010000 diff --git a/drivers/scsi/qla2xxx/qla_bsg.c b/drivers/scsi/qla2xxx/qla_bsg.c index 10431a67d202..ccfc2d26dd37 100644 --- a/drivers/scsi/qla2xxx/qla_bsg.c +++ b/drivers/scsi/qla2xxx/qla_bsg.c @@ -3106,8 +3106,8 @@ static bool qla_bsg_found(struct qla_qpair *qpair, struct bsg_job *bsg_job) switch (rval) { case QLA_SUCCESS: /* Wait for the command completion. */ - ratov_j = ha->r_a_tov / 10 * 4 * 1000; - ratov_j = msecs_to_jiffies(ratov_j); + ratov_j = ha->r_a_tov / 10 * 4; + ratov_j = secs_to_jiffies(ratov_j); if (!wait_for_completion_timeout(&comp, ratov_j)) { ql_log(ql_log_info, vha, 0x7089, diff --git a/drivers/scsi/qla2xxx/qla_def.h b/drivers/scsi/qla2xxx/qla_def.h index cb95b7b12051..604e66bead1e 100644 --- a/drivers/scsi/qla2xxx/qla_def.h +++ b/drivers/scsi/qla2xxx/qla_def.h @@ -4890,9 +4890,7 @@ struct purex_item { struct purex_item *pkt); atomic_t in_use; uint16_t size; - struct { - uint8_t iocb[64]; - } iocb; + uint8_t iocb[] __counted_by(size); }; #include "qla_edif.h" @@ -5101,7 +5099,6 @@ typedef struct scsi_qla_host { struct list_head head; spinlock_t lock; } purex_list; - struct purex_item default_item; struct name_list_extended gnl; /* Count of active session/fcport */ @@ -5130,6 +5127,11 @@ typedef struct scsi_qla_host { #define DPORT_DIAG_IN_PROGRESS BIT_0 #define DPORT_DIAG_CHIP_RESET_IN_PROGRESS BIT_1 uint16_t dport_status; + + /* Must be last --ends in a flexible-array member. */ + TRAILING_OVERLAP(struct purex_item, default_item, iocb, + uint8_t __default_item_iocb[QLA_DEFAULT_PAYLOAD_SIZE]; + ); } scsi_qla_host_t; struct qla27xx_image_status { diff --git a/drivers/scsi/qla2xxx/qla_edif.c b/drivers/scsi/qla2xxx/qla_edif.c index 91bbd3b75bff..ccd4485087a1 100644 --- a/drivers/scsi/qla2xxx/qla_edif.c +++ b/drivers/scsi/qla2xxx/qla_edif.c @@ -1798,7 +1798,7 @@ retry: switch (rval) { case QLA_SUCCESS: break; - case EAGAIN: + case -EAGAIN: msleep(EDIF_MSLEEP_INTERVAL); cnt++; if (cnt < EDIF_RETRY_COUNT) @@ -3649,7 +3649,7 @@ retry: p->e.extra_rx_xchg_address, p->e.extra_control_flags, sp->handle, sp->remap.req.len, bsg_job); break; - case EAGAIN: + case -EAGAIN: msleep(EDIF_MSLEEP_INTERVAL); cnt++; if (cnt < EDIF_RETRY_COUNT) diff --git a/drivers/scsi/qla2xxx/qla_init.c b/drivers/scsi/qla2xxx/qla_init.c index be211ff22acb..6a2e1c7fd125 100644 --- a/drivers/scsi/qla2xxx/qla_init.c +++ b/drivers/scsi/qla2xxx/qla_init.c @@ -2059,11 +2059,11 @@ static void qla_marker_sp_done(srb_t *sp, int res) int cnt = 5; \ do { \ if (_chip_gen != sp->vha->hw->chip_reset || _login_gen != sp->fcport->login_gen) {\ - _rval = EINVAL; \ + _rval = -EINVAL; \ break; \ } \ _rval = qla2x00_start_sp(_sp); \ - if (_rval == EAGAIN) \ + if (_rval == -EAGAIN) \ msleep(1); \ else \ break; \ diff --git a/drivers/scsi/qla2xxx/qla_isr.c b/drivers/scsi/qla2xxx/qla_isr.c index c4c6b5c6658c..4559b490614d 100644 --- a/drivers/scsi/qla2xxx/qla_isr.c +++ b/drivers/scsi/qla2xxx/qla_isr.c @@ -1077,17 +1077,17 @@ static struct purex_item * qla24xx_alloc_purex_item(scsi_qla_host_t *vha, uint16_t size) { struct purex_item *item = NULL; - uint8_t item_hdr_size = sizeof(*item); if (size > QLA_DEFAULT_PAYLOAD_SIZE) { - item = kzalloc(item_hdr_size + - (size - QLA_DEFAULT_PAYLOAD_SIZE), GFP_ATOMIC); + item = kzalloc(struct_size(item, iocb, size), GFP_ATOMIC); } else { if 
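
A short sketch of the allocation pattern the __counted_by(size) annotation above expects; with FORTIFY_SOURCE or UBSAN bounds checking enabled, accesses to item->iocb[] are then checked against item->size (payload_size and pkt are stand-ins):

struct purex_item *item;

item = kzalloc(struct_size(item, iocb, payload_size), GFP_ATOMIC);
if (!item)
	return NULL;
item->size = payload_size;	/* write the counter before touching iocb[] */
memcpy(item->iocb, pkt, payload_size);
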
(atomic_inc_return(&vha->default_item.in_use) == 1) { item = &vha->default_item; goto initialize_purex_header; } else { - item = kzalloc(item_hdr_size, GFP_ATOMIC); + item = kzalloc( + struct_size(item, iocb, QLA_DEFAULT_PAYLOAD_SIZE), + GFP_ATOMIC); } } if (!item) { @@ -1127,17 +1127,16 @@ qla24xx_queue_purex_item(scsi_qla_host_t *vha, struct purex_item *pkt, * @vha: SCSI driver HA context * @pkt: ELS packet */ -static struct purex_item -*qla24xx_copy_std_pkt(struct scsi_qla_host *vha, void *pkt) +static struct purex_item * +qla24xx_copy_std_pkt(struct scsi_qla_host *vha, void *pkt) { struct purex_item *item; - item = qla24xx_alloc_purex_item(vha, - QLA_DEFAULT_PAYLOAD_SIZE); + item = qla24xx_alloc_purex_item(vha, QLA_DEFAULT_PAYLOAD_SIZE); if (!item) return item; - memcpy(&item->iocb, pkt, sizeof(item->iocb)); + memcpy(&item->iocb, pkt, QLA_DEFAULT_PAYLOAD_SIZE); return item; } diff --git a/drivers/scsi/qla2xxx/qla_nvme.c b/drivers/scsi/qla2xxx/qla_nvme.c index 8ee2e337c9e1..065f9bcca26f 100644 --- a/drivers/scsi/qla2xxx/qla_nvme.c +++ b/drivers/scsi/qla2xxx/qla_nvme.c @@ -419,7 +419,7 @@ retry: switch (rval) { case QLA_SUCCESS: break; - case EAGAIN: + case -EAGAIN: msleep(PURLS_MSLEEP_INTERVAL); cnt++; if (cnt < PURLS_RETRY_COUNT) @@ -1308,7 +1308,7 @@ void qla2xxx_process_purls_iocb(void **pkt, struct rsp_que **rsp) ql_dbg(ql_dbg_unsol, vha, 0x2121, "PURLS OP[%01x] size %d xchg addr 0x%x portid %06x\n", - item->iocb.iocb[3], item->size, uctx->exchange_address, + item->iocb[3], item->size, uctx->exchange_address, fcport->d_id.b24); /* +48 0 1 2 3 4 5 6 7 8 9 A B C D E F * ----- ----------------------------------------------- diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c index d4b484c0fd9d..98a5c105fdfd 100644 --- a/drivers/scsi/qla2xxx/qla_os.c +++ b/drivers/scsi/qla2xxx/qla_os.c @@ -1291,8 +1291,8 @@ qla2xxx_eh_abort(struct scsi_cmnd *cmd) "Abort command mbx cmd=%p, rval=%x.\n", cmd, rval); /* Wait for the command completion. */ - ratov_j = ha->r_a_tov/10 * 4 * 1000; - ratov_j = msecs_to_jiffies(ratov_j); + ratov_j = ha->r_a_tov / 10 * 4; + ratov_j = secs_to_jiffies(ratov_j); switch (rval) { case QLA_SUCCESS: if (!wait_for_completion_timeout(&comp, ratov_j)) { @@ -1806,8 +1806,8 @@ static void qla2x00_abort_srb(struct qla_qpair *qp, srb_t *sp, const int res, rval = ha->isp_ops->abort_command(sp); /* Wait for command completion. 
*/ ret_cmd = false; - ratov_j = ha->r_a_tov/10 * 4 * 1000; - ratov_j = msecs_to_jiffies(ratov_j); + ratov_j = ha->r_a_tov / 10 * 4; + ratov_j = secs_to_jiffies(ratov_j); switch (rval) { case QLA_SUCCESS: if (wait_for_completion_timeout(&comp, ratov_j)) { @@ -6459,9 +6459,10 @@ dealloc: void qla24xx_free_purex_item(struct purex_item *item) { - if (item == &item->vha->default_item) + if (item == &item->vha->default_item) { memset(&item->vha->default_item, 0, sizeof(struct purex_item)); - else + memset(&item->vha->__default_item_iocb, 0, QLA_DEFAULT_PAYLOAD_SIZE); + } else kfree(item); } diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c index 353cb60e1abe..b2ab97be5db3 100644 --- a/drivers/scsi/scsi_debug.c +++ b/drivers/scsi/scsi_debug.c @@ -1155,14 +1155,9 @@ static ssize_t sdebug_error_write(struct file *file, const char __user *ubuf, struct sdebug_err_inject *inject; struct scsi_device *sdev = (struct scsi_device *)file->f_inode->i_private; - buf = kzalloc(count + 1, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - if (copy_from_user(buf, ubuf, count)) { - kfree(buf); - return -EFAULT; - } + buf = memdup_user_nul(ubuf, count); + if (IS_ERR(buf)) + return PTR_ERR(buf); if (buf[0] == '-') return sdebug_err_remove(sdev, buf, count); @@ -8805,8 +8800,8 @@ static int sdebug_add_store(void) /* Logical Block Provisioning */ if (scsi_debug_lbp()) { map_size = lba_to_map_index(sdebug_store_sectors - 1) + 1; - sip->map_storep = vmalloc(array_size(sizeof(long), - BITS_TO_LONGS(map_size))); + sip->map_storep = vcalloc(BITS_TO_LONGS(map_size), + sizeof(long)); pr_info("%lu provisioning blocks\n", map_size); @@ -8815,8 +8810,6 @@ static int sdebug_add_store(void) goto err; } - bitmap_zero(sip->map_storep, map_size); - /* Map first 1KB for partition table */ if (sdebug_num_parts) map_region(sip, 0, 2); diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 00ad574ce61c..0252d3f6bed1 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -106,7 +106,7 @@ static void sd_config_discard(struct scsi_disk *sdkp, struct queue_limits *lim, unsigned int mode); static void sd_config_write_same(struct scsi_disk *sdkp, struct queue_limits *lim); -static int sd_revalidate_disk(struct gendisk *); +static void sd_revalidate_disk(struct gendisk *); static void sd_unlock_native_capacity(struct gendisk *disk); static void sd_shutdown(struct device *); static void scsi_disk_release(struct device *cdev); @@ -3691,13 +3691,13 @@ static void sd_read_block_zero(struct scsi_disk *sdkp) * performs disk spin up, read_capacity, etc. * @disk: struct gendisk we care about **/ -static int sd_revalidate_disk(struct gendisk *disk) +static void sd_revalidate_disk(struct gendisk *disk) { struct scsi_disk *sdkp = scsi_disk(disk); struct scsi_device *sdp = sdkp->device; sector_t old_capacity = sdkp->capacity; - struct queue_limits lim; - unsigned char *buffer; + struct queue_limits *lim = NULL; + unsigned char *buffer = NULL; unsigned int dev_max; int err; @@ -3709,25 +3709,26 @@ static int sd_revalidate_disk(struct gendisk *disk) * of the other niceties. 
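
For reference, memdup_user_nul() as used in the scsi_debug hunk above bundles exactly the allocate/copy/terminate sequence it replaces. A rough open-coded equivalent (simplified; the real helper lives in mm/util.c):

static char *memdup_user_nul_sketch(const char __user *ubuf, size_t count)
{
	char *buf = kmalloc(count + 1, GFP_KERNEL);

	if (!buf)
		return ERR_PTR(-ENOMEM);
	if (copy_from_user(buf, ubuf, count)) {
		kfree(buf);
		return ERR_PTR(-EFAULT);
	}
	buf[count] = '\0';	/* callers may treat the result as a string */
	return buf;
}
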
*/ if (!scsi_device_online(sdp)) - goto out; + return; + + lim = kmalloc(sizeof(*lim), GFP_KERNEL); + if (!lim) + return; buffer = kmalloc(SD_BUF_SIZE, GFP_KERNEL); - if (!buffer) { - sd_printk(KERN_WARNING, sdkp, "sd_revalidate_disk: Memory " - "allocation failure.\n"); + if (!buffer) goto out; - } sd_spinup_disk(sdkp); - lim = queue_limits_start_update(sdkp->disk->queue); + *lim = queue_limits_start_update(sdkp->disk->queue); /* * Without media there is no reason to ask; moreover, some devices * react badly if we do. */ if (sdkp->media_present) { - sd_read_capacity(sdkp, &lim, buffer); + sd_read_capacity(sdkp, lim, buffer); /* * Some USB/UAS devices return generic values for mode pages * until the media has been accessed. Trigger a READ operation @@ -3741,17 +3742,17 @@ static int sd_revalidate_disk(struct gendisk *disk) * cause this to be updated correctly and any device which * doesn't support it should be treated as rotational. */ - lim.features |= (BLK_FEAT_ROTATIONAL | BLK_FEAT_ADD_RANDOM); + lim->features |= (BLK_FEAT_ROTATIONAL | BLK_FEAT_ADD_RANDOM); if (scsi_device_supports_vpd(sdp)) { sd_read_block_provisioning(sdkp); - sd_read_block_limits(sdkp, &lim); + sd_read_block_limits(sdkp, lim); sd_read_block_limits_ext(sdkp); - sd_read_block_characteristics(sdkp, &lim); - sd_zbc_read_zones(sdkp, &lim, buffer); + sd_read_block_characteristics(sdkp, lim); + sd_zbc_read_zones(sdkp, lim, buffer); } - sd_config_discard(sdkp, &lim, sd_discard_mode(sdkp)); + sd_config_discard(sdkp, lim, sd_discard_mode(sdkp)); sd_print_capacity(sdkp, old_capacity); @@ -3761,47 +3762,46 @@ static int sd_revalidate_disk(struct gendisk *disk) sd_read_app_tag_own(sdkp, buffer); sd_read_write_same(sdkp, buffer); sd_read_security(sdkp, buffer); - sd_config_protection(sdkp, &lim); + sd_config_protection(sdkp, lim); } /* * We now have all cache related info, determine how we deal * with flush requests. */ - sd_set_flush_flag(sdkp, &lim); + sd_set_flush_flag(sdkp, lim); /* Initial block count limit based on CDB TRANSFER LENGTH field size. */ dev_max = sdp->use_16_for_rw ? SD_MAX_XFER_BLOCKS : SD_DEF_XFER_BLOCKS; /* Some devices report a maximum block count for READ/WRITE requests. */ dev_max = min_not_zero(dev_max, sdkp->max_xfer_blocks); - lim.max_dev_sectors = logical_to_sectors(sdp, dev_max); + lim->max_dev_sectors = logical_to_sectors(sdp, dev_max); if (sd_validate_min_xfer_size(sdkp)) - lim.io_min = logical_to_bytes(sdp, sdkp->min_xfer_blocks); + lim->io_min = logical_to_bytes(sdp, sdkp->min_xfer_blocks); else - lim.io_min = 0; + lim->io_min = 0; /* * Limit default to SCSI host optimal sector limit if set. There may be * an impact on performance for when the size of a request exceeds this * host limit. 
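
min_not_zero(), used above for dev_max and just below for io_opt, treats zero as "unset" rather than as a candidate minimum, so a zero value on either side never clamps the other:

/*
 * min_not_zero(0, 512)    == 512
 * min_not_zero(4096, 512) == 512
 * min_not_zero(4096, 0)   == 4096
 */
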
*/ - lim.io_opt = sdp->host->opt_sectors << SECTOR_SHIFT; + lim->io_opt = sdp->host->opt_sectors << SECTOR_SHIFT; if (sd_validate_opt_xfer_size(sdkp, dev_max)) { - lim.io_opt = min_not_zero(lim.io_opt, + lim->io_opt = min_not_zero(lim->io_opt, logical_to_bytes(sdp, sdkp->opt_xfer_blocks)); } sdkp->first_scan = 0; set_capacity_and_notify(disk, logical_to_sectors(sdp, sdkp->capacity)); - sd_config_write_same(sdkp, &lim); - kfree(buffer); + sd_config_write_same(sdkp, lim); - err = queue_limits_commit_update_frozen(sdkp->disk->queue, &lim); + err = queue_limits_commit_update_frozen(sdkp->disk->queue, lim); if (err) - return err; + goto out; /* * Query concurrent positioning ranges after @@ -3820,7 +3820,9 @@ static int sd_revalidate_disk(struct gendisk *disk) set_capacity_and_notify(disk, 0); out: - return 0; + kfree(buffer); + kfree(lim); + } /** diff --git a/drivers/scsi/smartpqi/smartpqi_init.c b/drivers/scsi/smartpqi/smartpqi_init.c index 125944941601..03c97e60d36f 100644 --- a/drivers/scsi/smartpqi/smartpqi_init.c +++ b/drivers/scsi/smartpqi/smartpqi_init.c @@ -20,6 +20,7 @@ #include <linux/reboot.h> #include <linux/cciss_ioctl.h> #include <linux/crash_dump.h> +#include <linux/string.h> #include <scsi/scsi_host.h> #include <scsi/scsi_cmnd.h> #include <scsi/scsi_device.h> @@ -6774,17 +6775,15 @@ static int pqi_passthru_ioctl(struct pqi_ctrl_info *ctrl_info, void __user *arg) } if (iocommand.buf_size > 0) { - kernel_buffer = kmalloc(iocommand.buf_size, GFP_KERNEL); - if (!kernel_buffer) - return -ENOMEM; if (iocommand.Request.Type.Direction & XFER_WRITE) { - if (copy_from_user(kernel_buffer, iocommand.buf, - iocommand.buf_size)) { - rc = -EFAULT; - goto out; - } + kernel_buffer = memdup_user(iocommand.buf, + iocommand.buf_size); + if (IS_ERR(kernel_buffer)) + return PTR_ERR(kernel_buffer); } else { - memset(kernel_buffer, 0, iocommand.buf_size); + kernel_buffer = kzalloc(iocommand.buf_size, GFP_KERNEL); + if (!kernel_buffer) + return -ENOMEM; } } diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c index dc51ea352198..567f9cd29102 100644 --- a/drivers/scsi/storvsc_drv.c +++ b/drivers/scsi/storvsc_drv.c @@ -1941,8 +1941,8 @@ static int storvsc_probe(struct hv_device *device, int num_present_cpus = num_present_cpus(); struct Scsi_Host *host; struct hv_host_device *host_dev; - bool dev_is_ide = ((dev_id->driver_data == IDE_GUID) ? true : false); - bool is_fc = ((dev_id->driver_data == SFC_GUID) ? 
true : false); + bool dev_is_ide = dev_id->driver_data == IDE_GUID; + bool is_fc = dev_id->driver_data == SFC_GUID; int target = 0; struct storvsc_device *stor_device; int max_sub_channels = 0; diff --git a/drivers/target/iscsi/iscsi_target_configfs.c b/drivers/target/iscsi/iscsi_target_configfs.c index 88db94f382bb..efe8cdb20060 100644 --- a/drivers/target/iscsi/iscsi_target_configfs.c +++ b/drivers/target/iscsi/iscsi_target_configfs.c @@ -665,7 +665,7 @@ static ssize_t lio_target_nacl_cmdsn_depth_store(struct config_item *item, } acl_ci = &se_nacl->acl_group.cg_item; if (!acl_ci) { - pr_err("Unable to locatel acl_ci\n"); + pr_err("Unable to locate acl_ci\n"); return -EINVAL; } tpg_ci = &acl_ci->ci_parent->ci_group->cg_item; @@ -684,7 +684,7 @@ static ssize_t lio_target_nacl_cmdsn_depth_store(struct config_item *item, ret = core_tpg_set_initiator_node_queue_depth(se_nacl, cmdsn_depth); - pr_debug("LIO_Target_ConfigFS: %s/%s Set CmdSN Window: %u for" + pr_debug("LIO_Target_ConfigFS: %s/%s Set CmdSN Window: %u for " "InitiatorName: %s\n", config_item_name(wwn_ci), config_item_name(tpg_ci), cmdsn_depth, config_item_name(acl_ci)); @@ -1131,7 +1131,7 @@ static void lio_target_tiqn_deltpg(struct se_portal_group *se_tpg) /* End items for lio_target_tiqn_cit */ -/* Start LIO-Target TIQN struct contig_item lio_target_cit */ +/* Start LIO-Target TIQN struct config_item lio_target_cit */ static ssize_t lio_target_wwn_lio_version_show(struct config_item *item, char *page) diff --git a/drivers/target/iscsi/iscsi_target_tmr.c b/drivers/target/iscsi/iscsi_target_tmr.c index f60b156ede12..620de3910599 100644 --- a/drivers/target/iscsi/iscsi_target_tmr.c +++ b/drivers/target/iscsi/iscsi_target_tmr.c @@ -112,7 +112,8 @@ u8 iscsit_tmr_task_reassign( struct iscsi_tmr_req *tmr_req = cmd->tmr_req; struct se_tmr_req *se_tmr = cmd->se_cmd.se_tmr_req; struct iscsi_tm *hdr = (struct iscsi_tm *) buf; - u64 ret, ref_lun; + u64 ref_lun; + int ret; pr_debug("Got TASK_REASSIGN TMR ITT: 0x%08x," " RefTaskTag: 0x%08x, ExpDataSN: 0x%08x, CID: %hu\n", diff --git a/drivers/ufs/core/ufs-mcq.c b/drivers/ufs/core/ufs-mcq.c index cc88aaa106da..c9bdd4140fd0 100644 --- a/drivers/ufs/core/ufs-mcq.c +++ b/drivers/ufs/core/ufs-mcq.c @@ -29,6 +29,10 @@ #define MCQ_ENTRY_SIZE_IN_DWORD 8 #define CQE_UCD_BA GENMASK_ULL(63, 7) +#define UFSHCD_ENABLE_MCQ_INTRS (UTP_TASK_REQ_COMPL |\ + UFSHCD_ERROR_MASK |\ + MCQ_CQ_EVENT_STATUS) + /* Max mcq register polling time in microseconds */ #define MCQ_POLL_US 500000 @@ -355,9 +359,16 @@ EXPORT_SYMBOL_GPL(ufshcd_mcq_poll_cqe_lock); void ufshcd_mcq_make_queues_operational(struct ufs_hba *hba) { struct ufs_hw_queue *hwq; + u32 intrs; u16 qsize; int i; + /* Enable required interrupts */ + intrs = UFSHCD_ENABLE_MCQ_INTRS; + if (hba->quirks & UFSHCD_QUIRK_MCQ_BROKEN_INTR) + intrs &= ~MCQ_CQ_EVENT_STATUS; + ufshcd_enable_intr(hba, intrs); + for (i = 0; i < hba->nr_hw_queues; i++) { hwq = &hba->uhq[i]; hwq->id = i; diff --git a/drivers/ufs/core/ufs-sysfs.c b/drivers/ufs/core/ufs-sysfs.c index 4bd7d491e3c5..0086816b27cd 100644 --- a/drivers/ufs/core/ufs-sysfs.c +++ b/drivers/ufs/core/ufs-sysfs.c @@ -512,6 +512,8 @@ static ssize_t pm_qos_enable_show(struct device *dev, { struct ufs_hba *hba = dev_get_drvdata(dev); + guard(mutex)(&hba->pm_qos_mutex); + return sysfs_emit(buf, "%d\n", hba->pm_qos_enabled); } diff --git a/drivers/ufs/core/ufs_trace.h b/drivers/ufs/core/ufs_trace.h index caa32e23ffa5..584c2b5c6ad9 100644 --- a/drivers/ufs/core/ufs_trace.h +++ b/drivers/ufs/core/ufs_trace.h @@ -11,6 +11,7 @@ 
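
guard(mutex)(), from <linux/cleanup.h>, pairs the lock with an automatic unlock at scope exit, which is what makes the early-return style in the pm_qos hunks safe. An illustrative, hypothetical function:

static int pm_qos_check(struct ufs_hba *hba)
{
	guard(mutex)(&hba->pm_qos_mutex);

	if (!hba->pm_qos_enabled)
		return -EINVAL;	/* mutex released here automatically */

	return 0;		/* ...and here */
}
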
#include <ufs/ufs.h> #include <linux/tracepoint.h> +#include "ufs_trace_types.h" #define str_opcode(opcode) \ __print_symbolic(opcode, \ diff --git a/drivers/ufs/core/ufs_trace_types.h b/drivers/ufs/core/ufs_trace_types.h new file mode 100644 index 000000000000..f2d5ad1d92b9 --- /dev/null +++ b/drivers/ufs/core/ufs_trace_types.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef _UFS_TRACE_TYPES_H_ +#define _UFS_TRACE_TYPES_H_ + +enum ufs_trace_str_t { + UFS_CMD_SEND, + UFS_CMD_COMP, + UFS_DEV_COMP, + UFS_QUERY_SEND, + UFS_QUERY_COMP, + UFS_QUERY_ERR, + UFS_TM_SEND, + UFS_TM_COMP, + UFS_TM_ERR +}; + +enum ufs_trace_tsf_t { + UFS_TSF_CDB, + UFS_TSF_OSF, + UFS_TSF_TM_INPUT, + UFS_TSF_TM_OUTPUT +}; + +#endif /* _UFS_TRACE_TYPES_H_ */ diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index 9a43102b2b21..d9632d7c5f01 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -45,11 +45,6 @@ UTP_TASK_REQ_COMPL |\ UFSHCD_ERROR_MASK) -#define UFSHCD_ENABLE_MCQ_INTRS (UTP_TASK_REQ_COMPL |\ - UFSHCD_ERROR_MASK |\ - MCQ_CQ_EVENT_STATUS) - - /* UIC command timeout, unit: ms */ enum { UIC_CMD_TIMEOUT_DEFAULT = 500, @@ -316,6 +311,9 @@ static const struct ufs_dev_quirk ufs_fixups[] = { { .wmanufacturerid = UFS_VENDOR_TOSHIBA, .model = "THGLF2G9D8KBADG", .quirk = UFS_DEVICE_QUIRK_PA_TACTIVATE }, + { .wmanufacturerid = UFS_VENDOR_TOSHIBA, + .model = "THGJFJT1E45BATP", + .quirk = UFS_DEVICE_QUIRK_NO_TIMESTAMP_SUPPORT }, {} }; @@ -369,7 +367,7 @@ EXPORT_SYMBOL_GPL(ufshcd_disable_irq); * @hba: per adapter instance * @intrs: interrupt bits */ -static void ufshcd_enable_intr(struct ufs_hba *hba, u32 intrs) +void ufshcd_enable_intr(struct ufs_hba *hba, u32 intrs) { u32 old_val = ufshcd_readl(hba, REG_INTERRUPT_ENABLE); u32 new_val = old_val | intrs; @@ -606,10 +604,12 @@ void ufshcd_print_tr(struct ufs_hba *hba, int tag, bool pr_prdt) lrbp = &hba->lrb[tag]; - dev_err(hba->dev, "UPIU[%d] - issue time %lld us\n", - tag, div_u64(lrbp->issue_time_stamp_local_clock, 1000)); - dev_err(hba->dev, "UPIU[%d] - complete time %lld us\n", - tag, div_u64(lrbp->compl_time_stamp_local_clock, 1000)); + if (hba->monitor.enabled) { + dev_err(hba->dev, "UPIU[%d] - issue time %lld us\n", tag, + div_u64(lrbp->issue_time_stamp_local_clock, 1000)); + dev_err(hba->dev, "UPIU[%d] - complete time %lld us\n", tag, + div_u64(lrbp->compl_time_stamp_local_clock, 1000)); + } dev_err(hba->dev, "UPIU[%d] - Transfer Request Descriptor phys@0x%llx\n", tag, (u64)lrbp->utrd_dma_addr); @@ -1045,6 +1045,7 @@ EXPORT_SYMBOL_GPL(ufshcd_is_hba_active); */ void ufshcd_pm_qos_init(struct ufs_hba *hba) { + guard(mutex)(&hba->pm_qos_mutex); if (hba->pm_qos_enabled) return; @@ -1061,6 +1062,8 @@ void ufshcd_pm_qos_init(struct ufs_hba *hba) */ void ufshcd_pm_qos_exit(struct ufs_hba *hba) { + guard(mutex)(&hba->pm_qos_mutex); + if (!hba->pm_qos_enabled) return; @@ -1075,6 +1078,8 @@ void ufshcd_pm_qos_exit(struct ufs_hba *hba) */ static void ufshcd_pm_qos_update(struct ufs_hba *hba, bool on) { + guard(mutex)(&hba->pm_qos_mutex); + if (!hba->pm_qos_enabled) return; @@ -2230,11 +2235,13 @@ static void ufshcd_exit_clk_gating(struct ufs_hba *hba) static void ufshcd_clk_scaling_start_busy(struct ufs_hba *hba) { bool queue_resume_work = false; - ktime_t curr_t = ktime_get(); + ktime_t curr_t; if (!ufshcd_is_clkscaling_supported(hba)) return; + curr_t = ktime_get(); + guard(spinlock_irqsave)(&hba->clk_scaling.lock); if (!hba->clk_scaling.active_reqs++) @@ -2354,10 +2361,12 @@ void ufshcd_send_command(struct ufs_hba 
*hba, unsigned int task_tag, struct ufshcd_lrb *lrbp = &hba->lrb[task_tag]; unsigned long flags; - lrbp->issue_time_stamp = ktime_get(); - lrbp->issue_time_stamp_local_clock = local_clock(); - lrbp->compl_time_stamp = ktime_set(0, 0); - lrbp->compl_time_stamp_local_clock = 0; + if (hba->monitor.enabled) { + lrbp->issue_time_stamp = ktime_get(); + lrbp->issue_time_stamp_local_clock = local_clock(); + lrbp->compl_time_stamp = ktime_set(0, 0); + lrbp->compl_time_stamp_local_clock = 0; + } ufshcd_add_command_trace(hba, task_tag, UFS_CMD_SEND); if (lrbp->cmd) ufshcd_clk_scaling_start_busy(hba); @@ -5622,8 +5631,10 @@ void ufshcd_compl_one_cqe(struct ufs_hba *hba, int task_tag, enum utp_ocs ocs; lrbp = &hba->lrb[task_tag]; - lrbp->compl_time_stamp = ktime_get(); - lrbp->compl_time_stamp_local_clock = local_clock(); + if (hba->monitor.enabled) { + lrbp->compl_time_stamp = ktime_get(); + lrbp->compl_time_stamp_local_clock = local_clock(); + } cmd = lrbp->cmd; if (cmd) { if (unlikely(ufshcd_should_inform_monitor(hba, lrbp))) @@ -6457,13 +6468,14 @@ void ufshcd_schedule_eh_work(struct ufs_hba *hba) } } -static void ufshcd_force_error_recovery(struct ufs_hba *hba) +void ufshcd_force_error_recovery(struct ufs_hba *hba) { spin_lock_irq(hba->host->host_lock); hba->force_reset = true; ufshcd_schedule_eh_work(hba); spin_unlock_irq(hba->host->host_lock); } +EXPORT_SYMBOL_GPL(ufshcd_force_error_recovery); static void ufshcd_clk_scaling_allow(struct ufs_hba *hba, bool allow) { @@ -8786,7 +8798,8 @@ static void ufshcd_set_timestamp_attr(struct ufs_hba *hba) struct ufs_dev_info *dev_info = &hba->dev_info; struct utp_upiu_query_v4_0 *upiu_data; - if (dev_info->wspecversion < 0x400) + if (dev_info->wspecversion < 0x400 || + hba->dev_quirks & UFS_DEVICE_QUIRK_NO_TIMESTAMP_SUPPORT) return; ufshcd_dev_man_lock(hba); @@ -8913,16 +8926,11 @@ err: static void ufshcd_config_mcq(struct ufs_hba *hba) { int ret; - u32 intrs; ret = ufshcd_mcq_vops_config_esi(hba); hba->mcq_esi_enabled = !ret; dev_info(hba->dev, "ESI %sconfigured\n", ret ? 
"is not " : ""); - intrs = UFSHCD_ENABLE_MCQ_INTRS; - if (hba->quirks & UFSHCD_QUIRK_MCQ_BROKEN_INTR) - intrs &= ~MCQ_CQ_EVENT_STATUS; - ufshcd_enable_intr(hba, intrs); ufshcd_mcq_make_queues_operational(hba); ufshcd_mcq_config_mac(hba, hba->nutrs); @@ -10756,6 +10764,10 @@ int ufshcd_init(struct ufs_hba *hba, void __iomem *mmio_base, unsigned int irq) mutex_init(&hba->ee_ctrl_mutex); mutex_init(&hba->wb_mutex); + + /* Initialize mutex for PM QoS request synchronization */ + mutex_init(&hba->pm_qos_mutex); + init_rwsem(&hba->clk_scaling_lock); ufshcd_init_clk_gating(hba); diff --git a/drivers/ufs/host/ufs-exynos.c b/drivers/ufs/host/ufs-exynos.c index f0adcd9dd553..70d195179eba 100644 --- a/drivers/ufs/host/ufs-exynos.c +++ b/drivers/ufs/host/ufs-exynos.c @@ -776,7 +776,7 @@ static void exynos_ufs_config_sync_pattern_mask(struct exynos_ufs *ufs, u32 mask, sync_len; enum { SYNC_LEN_G1 = 80 * 1000, /* 80us */ - SYNC_LEN_G2 = 40 * 1000, /* 44us */ + SYNC_LEN_G2 = 40 * 1000, /* 40us */ SYNC_LEN_G3 = 20 * 1000, /* 20us */ }; int i; @@ -1896,6 +1896,13 @@ static int fsd_ufs_pre_pwr_change(struct exynos_ufs *ufs, return 0; } +static int fsd_ufs_suspend(struct exynos_ufs *ufs) +{ + exynos_ufs_gate_clks(ufs); + hci_writel(ufs, 0, HCI_GPIO_OUT); + return 0; +} + static inline u32 get_mclk_period_unipro_18(struct exynos_ufs *ufs) { return (16 * 1000 * 1000000UL / ufs->mclk_rate); @@ -2162,6 +2169,7 @@ static const struct exynos_ufs_drv_data fsd_ufs_drvs = { .pre_link = fsd_ufs_pre_link, .post_link = fsd_ufs_post_link, .pre_pwr_change = fsd_ufs_pre_pwr_change, + .suspend = fsd_ufs_suspend, }; static const struct exynos_ufs_drv_data gs101_ufs_drvs = { diff --git a/drivers/ufs/host/ufs-mediatek.c b/drivers/ufs/host/ufs-mediatek.c index f902ce08c95a..758a393a9de1 100644 --- a/drivers/ufs/host/ufs-mediatek.c +++ b/drivers/ufs/host/ufs-mediatek.c @@ -29,6 +29,7 @@ #include "ufs-mediatek-sip.h" static int ufs_mtk_config_mcq(struct ufs_hba *hba, bool irq); +static void _ufs_mtk_clk_scale(struct ufs_hba *hba, bool scale_up); #define CREATE_TRACE_POINTS #include "ufs-mediatek-trace.h" @@ -415,7 +416,7 @@ static void ufs_mtk_dbg_sel(struct ufs_hba *hba) } } -static void ufs_mtk_wait_idle_state(struct ufs_hba *hba, +static int ufs_mtk_wait_idle_state(struct ufs_hba *hba, unsigned long retry_ms) { u64 timeout, time_checked; @@ -451,8 +452,12 @@ static void ufs_mtk_wait_idle_state(struct ufs_hba *hba, break; } while (time_checked < timeout); - if (wait_idle && sm != VS_HCE_BASE) + if (wait_idle && sm != VS_HCE_BASE) { dev_info(hba->dev, "wait idle tmo: 0x%x\n", val); + return -ETIMEDOUT; + } + + return 0; } static int ufs_mtk_wait_link_state(struct ufs_hba *hba, u32 state, @@ -798,8 +803,14 @@ static int ufs_mtk_setup_clocks(struct ufs_hba *hba, bool on, clk_pwr_off = true; } - if (clk_pwr_off) + if (clk_pwr_off) { ufs_mtk_pwr_ctrl(hba, false); + } else { + dev_warn(hba->dev, "Clock is not turned off, hba->ahit = 0x%x, AHIT = 0x%x\n", + hba->ahit, + ufshcd_readl(hba, + REG_AUTO_HIBERNATE_IDLE_TIMER)); + } ufs_mtk_mcq_disable_irq(hba); } else if (on && status == POST_CHANGE) { ufs_mtk_pwr_ctrl(hba, true); @@ -1018,7 +1029,7 @@ static int ufs_mtk_vreg_fix_vcc(struct ufs_hba *hba) struct arm_smccc_res res; int err, ver; - if (hba->vreg_info.vcc) + if (info->vcc) return 0; if (of_property_read_bool(np, "mediatek,ufs-vcc-by-num")) { @@ -1075,6 +1086,80 @@ static void ufs_mtk_vreg_fix_vccqx(struct ufs_hba *hba) } } +static void ufs_mtk_setup_clk_gating(struct ufs_hba *hba) +{ + unsigned long flags; + u32 ah_ms = 10; + 
u32 ah_scale, ah_timer; + u32 scale_us[] = {1, 10, 100, 1000, 10000, 100000}; + + if (ufshcd_is_clkgating_allowed(hba)) { + if (ufshcd_is_auto_hibern8_supported(hba) && hba->ahit) { + ah_scale = FIELD_GET(UFSHCI_AHIBERN8_SCALE_MASK, + hba->ahit); + ah_timer = FIELD_GET(UFSHCI_AHIBERN8_TIMER_MASK, + hba->ahit); + if (ah_scale <= 5) + ah_ms = ah_timer * scale_us[ah_scale] / 1000; + } + + spin_lock_irqsave(hba->host->host_lock, flags); + hba->clk_gating.delay_ms = max(ah_ms, 10U); + spin_unlock_irqrestore(hba->host->host_lock, flags); + } +} + +/* Convert microseconds to Auto-Hibernate Idle Timer register value */ +static u32 ufs_mtk_us_to_ahit(unsigned int timer) +{ + unsigned int scale; + + for (scale = 0; timer > UFSHCI_AHIBERN8_TIMER_MASK; ++scale) + timer /= UFSHCI_AHIBERN8_SCALE_FACTOR; + + return FIELD_PREP(UFSHCI_AHIBERN8_TIMER_MASK, timer) | + FIELD_PREP(UFSHCI_AHIBERN8_SCALE_MASK, scale); +} + +static void ufs_mtk_fix_ahit(struct ufs_hba *hba) +{ + unsigned int us; + + if (ufshcd_is_auto_hibern8_supported(hba)) { + switch (hba->dev_info.wmanufacturerid) { + case UFS_VENDOR_SAMSUNG: + /* configure auto-hibern8 timer to 3.5 ms */ + us = 3500; + break; + + case UFS_VENDOR_MICRON: + /* configure auto-hibern8 timer to 2 ms */ + us = 2000; + break; + + default: + /* configure auto-hibern8 timer to 1 ms */ + us = 1000; + break; + } + + hba->ahit = ufs_mtk_us_to_ahit(us); + } + + ufs_mtk_setup_clk_gating(hba); +} + +static void ufs_mtk_fix_clock_scaling(struct ufs_hba *hba) +{ + /* UFS version is below 4.0, clock scaling is not necessary */ + if ((hba->dev_info.wspecversion < 0x0400) && + ufs_mtk_is_clk_scale_ready(hba)) { + hba->caps &= ~UFSHCD_CAP_CLK_SCALING; + + _ufs_mtk_clk_scale(hba, false); + } +} + static void ufs_mtk_init_mcq_irq(struct ufs_hba *hba) { struct ufs_mtk_host *host = ufshcd_get_variant(hba); @@ -1240,6 +1325,10 @@ static bool ufs_mtk_pmc_via_fastauto(struct ufs_hba *hba, dev_req_params->gear_rx < UFS_HS_G4) return false; + if (dev_req_params->pwr_tx == SLOW_MODE || + dev_req_params->pwr_rx == SLOW_MODE) + return false; + return true; } @@ -1255,6 +1344,10 @@ static int ufs_mtk_pre_pwr_change(struct ufs_hba *hba, host_params.hs_rx_gear = UFS_HS_G5; host_params.hs_tx_gear = UFS_HS_G5; + if (dev_max_params->pwr_rx == SLOW_MODE || + dev_max_params->pwr_tx == SLOW_MODE) + host_params.desired_working_mode = UFS_PWM_MODE; + ret = ufshcd_negotiate_pwr_params(&host_params, dev_max_params, dev_req_params); if (ret) { pr_info("%s: failed to determine capabilities\n", @@ -1278,6 +1371,28 @@ static int ufs_mtk_pre_pwr_change(struct ufs_hba *hba, ufshcd_dme_set(hba, UIC_ARG_MIB(PA_TXHSADAPTTYPE), PA_NO_ADAPT); + if (!(hba->quirks & UFSHCD_QUIRK_SKIP_DEF_UNIPRO_TIMEOUT_SETTING)) { + ufshcd_dme_set(hba, UIC_ARG_MIB(PA_PWRMODEUSERDATA0), + DL_FC0ProtectionTimeOutVal_Default); + ufshcd_dme_set(hba, UIC_ARG_MIB(PA_PWRMODEUSERDATA1), + DL_TC0ReplayTimeOutVal_Default); + ufshcd_dme_set(hba, UIC_ARG_MIB(PA_PWRMODEUSERDATA2), + DL_AFC0ReqTimeOutVal_Default); + ufshcd_dme_set(hba, UIC_ARG_MIB(PA_PWRMODEUSERDATA3), + DL_FC1ProtectionTimeOutVal_Default); + ufshcd_dme_set(hba, UIC_ARG_MIB(PA_PWRMODEUSERDATA4), + DL_TC1ReplayTimeOutVal_Default); + ufshcd_dme_set(hba, UIC_ARG_MIB(PA_PWRMODEUSERDATA5), + DL_AFC1ReqTimeOutVal_Default); + + ufshcd_dme_set(hba, UIC_ARG_MIB(DME_LocalFC0ProtectionTimeOutVal), + DL_FC0ProtectionTimeOutVal_Default); + ufshcd_dme_set(hba, UIC_ARG_MIB(DME_LocalTC0ReplayTimeOutVal), + DL_TC0ReplayTimeOutVal_Default); + ufshcd_dme_set(hba, 
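
For reference, the ufs_mtk_us_to_ahit() / ufs_mtk_setup_clk_gating() arithmetic above works out as follows, assuming the usual UFSHCI field widths (timer value in AHIT[9:0], scale in AHIT[12:10], one tick = 10^scale microseconds):

	us = 3500: 3500 > 1023, so divide by 10 -> timer = 350, scale = 1
	           350 ticks x 10 us = 3.5 ms (the Samsung case)
	us = 2000: 2000 > 1023 -> timer = 200, scale = 1 -> 2 ms
	us = 1000: fits in AHIT[9:0] -> timer = 1000, scale = 0 -> 1 ms

	decode:    ah_ms = ah_timer * scale_us[ah_scale] / 1000,
	           then clamped to a minimum clk-gating delay of 10 ms
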
UIC_ARG_MIB(DME_LocalAFC0ReqTimeOutVal), + DL_AFC0ReqTimeOutVal_Default); + } + ret = ufshcd_uic_change_pwr_mode(hba, FASTAUTO_MODE << 4 | FASTAUTO_MODE); @@ -1287,10 +1402,59 @@ static int ufs_mtk_pre_pwr_change(struct ufs_hba *hba, } } - if (host->hw_ver.major >= 3) { + /* if already configured to the requested pwr_mode, skip adapt */ + if (dev_req_params->gear_rx == hba->pwr_info.gear_rx && + dev_req_params->gear_tx == hba->pwr_info.gear_tx && + dev_req_params->lane_rx == hba->pwr_info.lane_rx && + dev_req_params->lane_tx == hba->pwr_info.lane_tx && + dev_req_params->pwr_rx == hba->pwr_info.pwr_rx && + dev_req_params->pwr_tx == hba->pwr_info.pwr_tx && + dev_req_params->hs_rate == hba->pwr_info.hs_rate) { + return ret; + } + + if (dev_req_params->pwr_rx == FAST_MODE || + dev_req_params->pwr_rx == FASTAUTO_MODE) { + if (host->hw_ver.major >= 3) { + ret = ufshcd_dme_configure_adapt(hba, + dev_req_params->gear_tx, + PA_INITIAL_ADAPT); + } else { + ret = ufshcd_dme_configure_adapt(hba, + dev_req_params->gear_tx, + PA_NO_ADAPT); + } + } else { ret = ufshcd_dme_configure_adapt(hba, - dev_req_params->gear_tx, - PA_INITIAL_ADAPT); + dev_req_params->gear_tx, + PA_NO_ADAPT); + } + + return ret; +} + +static int ufs_mtk_auto_hibern8_disable(struct ufs_hba *hba) +{ + int ret; + + /* disable auto-hibern8 */ + ufshcd_writel(hba, 0, REG_AUTO_HIBERNATE_IDLE_TIMER); + + /* wait host return to idle state when auto-hibern8 off */ + ret = ufs_mtk_wait_idle_state(hba, 5); + if (ret) + goto out; + + ret = ufs_mtk_wait_link_state(hba, VS_LINK_UP, 100); + +out: + if (ret) { + dev_warn(hba->dev, "exit h8 state fail, ret=%d\n", ret); + + ufshcd_force_error_recovery(hba); + + /* trigger error handler and break suspend */ + ret = -EBUSY; } return ret; @@ -1302,13 +1466,20 @@ static int ufs_mtk_pwr_change_notify(struct ufs_hba *hba, struct ufs_pa_layer_attr *dev_req_params) { int ret = 0; + static u32 reg; switch (stage) { case PRE_CHANGE: + if (ufshcd_is_auto_hibern8_supported(hba)) { + reg = ufshcd_readl(hba, REG_AUTO_HIBERNATE_IDLE_TIMER); + ufs_mtk_auto_hibern8_disable(hba); + } ret = ufs_mtk_pre_pwr_change(hba, dev_max_params, dev_req_params); break; case POST_CHANGE: + if (ufshcd_is_auto_hibern8_supported(hba)) + ufshcd_writel(hba, reg, REG_AUTO_HIBERNATE_IDLE_TIMER); break; default: ret = -EINVAL; @@ -1342,6 +1513,7 @@ static int ufs_mtk_pre_link(struct ufs_hba *hba) { int ret; u32 tmp; + struct ufs_mtk_host *host = ufshcd_get_variant(hba); ufs_mtk_get_controller_version(hba); @@ -1367,34 +1539,33 @@ static int ufs_mtk_pre_link(struct ufs_hba *hba) ret = ufshcd_dme_set(hba, UIC_ARG_MIB(VS_SAVEPOWERCONTROL), tmp); + /* Enable the 1144 functions setting */ + if (host->ip_ver == IP_VER_MT6989) { + ret = ufshcd_dme_get(hba, UIC_ARG_MIB(VS_DEBUGOMC), &tmp); + if (ret) + return ret; + + tmp |= 0x10; + ret = ufshcd_dme_set(hba, UIC_ARG_MIB(VS_DEBUGOMC), tmp); + } + return ret; } -static void ufs_mtk_setup_clk_gating(struct ufs_hba *hba) +static void ufs_mtk_post_link(struct ufs_hba *hba) { - u32 ah_ms; + struct ufs_mtk_host *host = ufshcd_get_variant(hba); + u32 tmp; - if (ufshcd_is_clkgating_allowed(hba)) { - if (ufshcd_is_auto_hibern8_supported(hba) && hba->ahit) - ah_ms = FIELD_GET(UFSHCI_AHIBERN8_TIMER_MASK, - hba->ahit); - else - ah_ms = 10; - ufshcd_clkgate_delay_set(hba->dev, ah_ms + 5); + /* fix device PA_INIT no adapt */ + if (host->ip_ver >= IP_VER_MT6899) { + ufshcd_dme_get(hba, UIC_ARG_MIB(VS_DEBUGOMC), &tmp); + tmp |= 0x100; + ufshcd_dme_set(hba, UIC_ARG_MIB(VS_DEBUGOMC), tmp); } -} -static void 
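
ufs_mtk_auto_hibern8_disable() and the PRE_CHANGE/POST_CHANGE handling above form a save/disable/restore bracket around the power mode change; if the host never returns to idle, the state is unknown, so error recovery is forced instead of carrying on. A sketch of the bracket with the saved AHIT value kept per host rather than in a function-static (the demo_host/saved_ahit names are hypothetical):

	struct demo_host { u32 saved_ahit; };

	static void demo_ah8_pre_change(struct ufs_hba *hba, struct demo_host *h)
	{
		h->saved_ahit = ufshcd_readl(hba, REG_AUTO_HIBERNATE_IDLE_TIMER);
		ufshcd_writel(hba, 0, REG_AUTO_HIBERNATE_IDLE_TIMER); /* AH8 off */
		/* real code then waits for idle and, on timeout,
		 * forces error recovery and aborts the change */
	}

	static void demo_ah8_post_change(struct ufs_hba *hba, struct demo_host *h)
	{
		ufshcd_writel(hba, h->saved_ahit, REG_AUTO_HIBERNATE_IDLE_TIMER);
	}
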
ufs_mtk_post_link(struct ufs_hba *hba) -{ /* enable unipro clock gating feature */ ufs_mtk_cfg_unipro_cg(hba, true); - - /* will be configured during probe hba */ - if (ufshcd_is_auto_hibern8_supported(hba)) - hba->ahit = FIELD_PREP(UFSHCI_AHIBERN8_TIMER_MASK, 10) | - FIELD_PREP(UFSHCI_AHIBERN8_SCALE_MASK, 3); - - ufs_mtk_setup_clk_gating(hba); } static int ufs_mtk_link_startup_notify(struct ufs_hba *hba, @@ -1421,11 +1592,11 @@ static int ufs_mtk_device_reset(struct ufs_hba *hba) { struct arm_smccc_res res; - /* disable hba before device reset */ - ufshcd_hba_stop(hba); - ufs_mtk_device_reset_ctrl(0, res); + /* disable hba in middle of device reset */ + ufshcd_hba_stop(hba); + /* * The reset signal is active low. UFS devices shall detect * more than or equal to 1us of positive or negative RST_n @@ -1462,7 +1633,11 @@ static int ufs_mtk_link_set_hpm(struct ufs_hba *hba) return err; /* Check link state to make sure exit h8 success */ - ufs_mtk_wait_idle_state(hba, 5); + err = ufs_mtk_wait_idle_state(hba, 5); + if (err) { + dev_warn(hba->dev, "wait idle fail, err=%d\n", err); + return err; + } err = ufs_mtk_wait_link_state(hba, VS_LINK_UP, 100); if (err) { dev_warn(hba->dev, "exit h8 state fail, err=%d\n", err); @@ -1507,6 +1682,9 @@ static void ufs_mtk_vccqx_set_lpm(struct ufs_hba *hba, bool lpm) { struct ufs_vreg *vccqx = NULL; + if (!hba->vreg_info.vccq && !hba->vreg_info.vccq2) + return; + if (hba->vreg_info.vccq) vccqx = hba->vreg_info.vccq; else @@ -1561,21 +1739,6 @@ static void ufs_mtk_dev_vreg_set_lpm(struct ufs_hba *hba, bool lpm) } } -static void ufs_mtk_auto_hibern8_disable(struct ufs_hba *hba) -{ - int ret; - - /* disable auto-hibern8 */ - ufshcd_writel(hba, 0, REG_AUTO_HIBERNATE_IDLE_TIMER); - - /* wait host return to idle state when auto-hibern8 off */ - ufs_mtk_wait_idle_state(hba, 5); - - ret = ufs_mtk_wait_link_state(hba, VS_LINK_UP, 100); - if (ret) - dev_warn(hba->dev, "exit h8 state fail, ret=%d\n", ret); -} - static int ufs_mtk_suspend(struct ufs_hba *hba, enum ufs_pm_op pm_op, enum ufs_notify_change_status status) { @@ -1584,7 +1747,7 @@ static int ufs_mtk_suspend(struct ufs_hba *hba, enum ufs_pm_op pm_op, if (status == PRE_CHANGE) { if (ufshcd_is_auto_hibern8_supported(hba)) - ufs_mtk_auto_hibern8_disable(hba); + return ufs_mtk_auto_hibern8_disable(hba); return 0; } @@ -1642,8 +1805,21 @@ static int ufs_mtk_resume(struct ufs_hba *hba, enum ufs_pm_op pm_op) } return 0; + fail: - return ufshcd_link_recovery(hba); + /* + * Check if the platform (parent) device has resumed, and ensure that + * power, clock, and MTCMOS are all turned on. + */ + err = ufshcd_link_recovery(hba); + if (err) { + dev_err(hba->dev, "Device PM: req=%d, status:%d, err:%d\n", + hba->dev->power.request, + hba->dev->power.runtime_status, + hba->dev->power.runtime_error); + } + + return 0; /* Cannot return a failure, otherwise, the I/O will hang. 
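
The VS_DEBUGOMC tweaks in ufs_mtk_pre_link()/ufs_mtk_post_link() above are read-modify-write cycles on a UniPro attribute via the standard DME accessors. The general shape, as a sketch (demo_dme_set_bits is hypothetical):

	static int demo_dme_set_bits(struct ufs_hba *hba, u32 attr, u32 bits)
	{
		u32 tmp;
		int ret;

		ret = ufshcd_dme_get(hba, UIC_ARG_MIB(attr), &tmp);	/* read */
		if (ret)
			return ret;
		return ufshcd_dme_set(hba, UIC_ARG_MIB(attr), tmp | bits); /* write back */
	}
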
*/ } static void ufs_mtk_dbg_register_dump(struct ufs_hba *hba) @@ -1726,6 +1902,8 @@ static void ufs_mtk_fixup_dev_quirks(struct ufs_hba *hba) ufs_mtk_vreg_fix_vcc(hba); ufs_mtk_vreg_fix_vccqx(hba); + ufs_mtk_fix_ahit(hba); + ufs_mtk_fix_clock_scaling(hba); } static void ufs_mtk_event_notify(struct ufs_hba *hba, @@ -2012,6 +2190,7 @@ static int ufs_mtk_config_mcq_irq(struct ufs_hba *hba) return ret; } } + host->is_mcq_intr_enabled = true; return 0; } @@ -2095,10 +2274,12 @@ static const struct ufs_hba_variant_ops ufs_hba_mtk_vops = { static int ufs_mtk_probe(struct platform_device *pdev) { int err; - struct device *dev = &pdev->dev; - struct device_node *reset_node; - struct platform_device *reset_pdev; + struct device *dev = &pdev->dev, *phy_dev = NULL; + struct device_node *reset_node, *phy_node = NULL; + struct platform_device *reset_pdev, *phy_pdev = NULL; struct device_link *link; + struct ufs_hba *hba; + struct ufs_mtk_host *host; reset_node = of_find_compatible_node(NULL, NULL, "ti,syscon-reset"); @@ -2125,13 +2306,51 @@ static int ufs_mtk_probe(struct platform_device *pdev) } skip_reset: + /* find phy node */ + phy_node = of_parse_phandle(dev->of_node, "phys", 0); + + if (phy_node) { + phy_pdev = of_find_device_by_node(phy_node); + if (!phy_pdev) + goto skip_phy; + phy_dev = &phy_pdev->dev; + + pm_runtime_set_active(phy_dev); + pm_runtime_enable(phy_dev); + pm_runtime_get_sync(phy_dev); + + put_device(phy_dev); + dev_info(dev, "phys node found\n"); + } else { + dev_notice(dev, "phys node not found\n"); + } + +skip_phy: /* perform generic probe */ err = ufshcd_pltfrm_init(pdev, &ufs_hba_mtk_vops); - -out: - if (err) + if (err) { dev_err(dev, "probe failed %d\n", err); + goto out; + } + + hba = platform_get_drvdata(pdev); + if (!hba) + goto out; + + if (phy_node && phy_dev) { + host = ufshcd_get_variant(hba); + host->phy_dev = phy_dev; + } + + /* + * Because the default power setting of VSx (the upper layer of + * VCCQ/VCCQ2) is HWLP, we need to prevent VCCQ/VCCQ2 from + * entering LPM. 
+ */ + ufs_mtk_dev_vreg_set_lpm(hba, false); +out: + of_node_put(phy_node); of_node_put(reset_node); return err; } @@ -2156,27 +2375,38 @@ static int ufs_mtk_system_suspend(struct device *dev) ret = ufshcd_system_suspend(dev); if (ret) - return ret; + goto out; + + if (pm_runtime_suspended(hba->dev)) + goto out; ufs_mtk_dev_vreg_set_lpm(hba, true); if (ufs_mtk_is_rtff_mtcmos(hba)) ufs_mtk_mtcmos_ctrl(false, res); - return 0; +out: + return ret; } static int ufs_mtk_system_resume(struct device *dev) { + int ret = 0; struct ufs_hba *hba = dev_get_drvdata(dev); struct arm_smccc_res res; - ufs_mtk_dev_vreg_set_lpm(hba, false); + if (pm_runtime_suspended(hba->dev)) + goto out; if (ufs_mtk_is_rtff_mtcmos(hba)) ufs_mtk_mtcmos_ctrl(true, res); - return ufshcd_system_resume(dev); + ufs_mtk_dev_vreg_set_lpm(hba, false); + +out: + ret = ufshcd_system_resume(dev); + + return ret; } #endif @@ -2184,6 +2414,7 @@ static int ufs_mtk_system_resume(struct device *dev) static int ufs_mtk_runtime_suspend(struct device *dev) { struct ufs_hba *hba = dev_get_drvdata(dev); + struct ufs_mtk_host *host = ufshcd_get_variant(hba); struct arm_smccc_res res; int ret = 0; @@ -2196,17 +2427,24 @@ static int ufs_mtk_runtime_suspend(struct device *dev) if (ufs_mtk_is_rtff_mtcmos(hba)) ufs_mtk_mtcmos_ctrl(false, res); + if (host->phy_dev) + pm_runtime_put_sync(host->phy_dev); + return 0; } static int ufs_mtk_runtime_resume(struct device *dev) { struct ufs_hba *hba = dev_get_drvdata(dev); + struct ufs_mtk_host *host = ufshcd_get_variant(hba); struct arm_smccc_res res; if (ufs_mtk_is_rtff_mtcmos(hba)) ufs_mtk_mtcmos_ctrl(true, res); + if (host->phy_dev) + pm_runtime_get_sync(host->phy_dev); + ufs_mtk_dev_vreg_set_lpm(hba, false); return ufshcd_runtime_resume(dev); diff --git a/drivers/ufs/host/ufs-mediatek.h b/drivers/ufs/host/ufs-mediatek.h index e46dc5fa209d..dfbf78bd8664 100644 --- a/drivers/ufs/host/ufs-mediatek.h +++ b/drivers/ufs/host/ufs-mediatek.h @@ -193,6 +193,7 @@ struct ufs_mtk_host { bool is_mcq_intr_enabled; int mcq_nr_intr; struct ufs_mtk_mcq_intr_info mcq_intr_info[UFSHCD_MAX_Q_NR]; + struct device *phy_dev; }; /* MTK delay of autosuspend: 500 ms */ diff --git a/drivers/ufs/host/ufs-qcom.c b/drivers/ufs/host/ufs-qcom.c index 9574fdc2bb0f..3e83dc51d538 100644 --- a/drivers/ufs/host/ufs-qcom.c +++ b/drivers/ufs/host/ufs-qcom.c @@ -38,6 +38,9 @@ #define DEEMPHASIS_3_5_dB 0x04 #define NO_DEEMPHASIS 0x0 +#define UFS_ICE_SYNC_RST_SEL BIT(3) +#define UFS_ICE_SYNC_RST_SW BIT(4) + enum { TSTBUS_UAWM, TSTBUS_UARM, @@ -494,12 +497,8 @@ static int ufs_qcom_power_up_sequence(struct ufs_hba *hba) * If the HS-G5 PHY gear is used, update host_params->hs_rate to Rate-A, * so that the subsequent power mode change shall stick to Rate-A. */ - if (host->hw_ver.major == 0x5) { - if (host->phy_gear == UFS_HS_G5) - host_params->hs_rate = PA_HS_MODE_A; - else - host_params->hs_rate = PA_HS_MODE_B; - } + if (host->hw_ver.major == 0x5 && host->phy_gear == UFS_HS_G5) + host_params->hs_rate = PA_HS_MODE_A; mode = host_params->hs_rate == PA_HS_MODE_B ? 
PHY_MODE_UFS_HS_B : PHY_MODE_UFS_HS_A; @@ -751,11 +750,29 @@ static int ufs_qcom_resume(struct ufs_hba *hba, enum ufs_pm_op pm_op) { struct ufs_qcom_host *host = ufshcd_get_variant(hba); int err; + u32 reg_val; err = ufs_qcom_enable_lane_clks(host); if (err) return err; + if ((!ufs_qcom_is_link_active(hba)) && + host->hw_ver.major == 5 && + host->hw_ver.minor == 0 && + host->hw_ver.step == 0) { + ufshcd_writel(hba, UFS_ICE_SYNC_RST_SEL | UFS_ICE_SYNC_RST_SW, UFS_MEM_ICE_CFG); + reg_val = ufshcd_readl(hba, UFS_MEM_ICE_CFG); + reg_val &= ~(UFS_ICE_SYNC_RST_SEL | UFS_ICE_SYNC_RST_SW); + /* + * HW documentation doesn't recommend any delay between the + * reset set and clear. But we are enforcing an arbitrary delay + * to give flops enough time to settle in. + */ + usleep_range(50, 100); + ufshcd_writel(hba, reg_val, UFS_MEM_ICE_CFG); + ufshcd_readl(hba, UFS_MEM_ICE_CFG); + } + return ufs_qcom_ice_resume(host); } @@ -1096,6 +1113,18 @@ static void ufs_qcom_set_phy_gear(struct ufs_qcom_host *host) } } +static void ufs_qcom_parse_gear_limits(struct ufs_hba *hba) +{ + struct ufs_qcom_host *host = ufshcd_get_variant(hba); + struct ufs_host_params *host_params = &host->host_params; + u32 hs_gear_old = host_params->hs_tx_gear; + + ufshcd_parse_gear_limits(hba, host_params); + if (host_params->hs_tx_gear != hs_gear_old) { + host->phy_gear = host_params->hs_tx_gear; + } +} + static void ufs_qcom_set_host_params(struct ufs_hba *hba) { struct ufs_qcom_host *host = ufshcd_get_variant(hba); @@ -1162,6 +1191,13 @@ static int ufs_qcom_setup_clocks(struct ufs_hba *hba, bool on, case PRE_CHANGE: if (on) { ufs_qcom_icc_update_bw(host); + if (ufs_qcom_is_link_hibern8(hba)) { + err = ufs_qcom_enable_lane_clks(host); + if (err) { + dev_err(hba->dev, "enable lane clks failed, ret=%d\n", err); + return err; + } + } } else { if (!ufs_qcom_is_link_active(hba)) { /* disable device ref_clk */ @@ -1187,6 +1223,9 @@ static int ufs_qcom_setup_clocks(struct ufs_hba *hba, bool on, if (ufshcd_is_hs_mode(&hba->pwr_info)) ufs_qcom_dev_ref_clk_ctrl(host, true); } else { + if (ufs_qcom_is_link_hibern8(hba)) + ufs_qcom_disable_lane_clks(host); + ufs_qcom_icc_set_bw(host, ufs_qcom_bw_table[MODE_MIN][0][0].mem_bw, ufs_qcom_bw_table[MODE_MIN][0][0].cfg_bw); } @@ -1337,6 +1376,7 @@ static int ufs_qcom_init(struct ufs_hba *hba) ufs_qcom_advertise_quirks(hba); ufs_qcom_set_host_params(hba); ufs_qcom_set_phy_gear(host); + ufs_qcom_parse_gear_limits(hba); err = ufs_qcom_ice_init(host); if (err) @@ -1742,7 +1782,7 @@ static void ufs_qcom_dump_testbus(struct ufs_hba *hba) } static int ufs_qcom_dump_regs(struct ufs_hba *hba, size_t offset, size_t len, - const char *prefix, enum ufshcd_res id) + const char *prefix, void __iomem *base) { u32 *regs __free(kfree) = NULL; size_t pos; @@ -1755,7 +1795,7 @@ static int ufs_qcom_dump_regs(struct ufs_hba *hba, size_t offset, size_t len, return -ENOMEM; for (pos = 0; pos < len; pos += 4) - regs[pos / 4] = readl(hba->res[id].base + offset + pos); + regs[pos / 4] = readl(base + offset + pos); print_hex_dump(KERN_ERR, prefix, len > 4 ? 
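
The UFS_MEM_ICE_CFG sequence in ufs_qcom_resume() above is an assert/settle/deassert reset pulse, with the trailing ufshcd_readl() flushing the posted write. A generic sketch of that shape, not a verbatim refactor (register and bit arguments are hypothetical):

	static void demo_pulse_reset(struct ufs_hba *hba, u32 reg, u32 bits)
	{
		ufshcd_writel(hba, ufshcd_readl(hba, reg) | bits, reg);	 /* assert */
		usleep_range(50, 100);		/* give the flops time to settle */
		ufshcd_writel(hba, ufshcd_readl(hba, reg) & ~bits, reg); /* deassert */
		ufshcd_readl(hba, reg);		/* read back to flush posted write */
	}
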
DUMP_PREFIX_OFFSET : DUMP_PREFIX_NONE, @@ -1766,30 +1806,34 @@ static int ufs_qcom_dump_regs(struct ufs_hba *hba, size_t offset, size_t len, static void ufs_qcom_dump_mcq_hci_regs(struct ufs_hba *hba) { + struct ufshcd_mcq_opr_info_t *opr = &hba->mcq_opr[0]; + void __iomem *mcq_vs_base = hba->mcq_base + UFS_MEM_VS_BASE; + struct dump_info { + void __iomem *base; size_t offset; size_t len; const char *prefix; - enum ufshcd_res id; }; struct dump_info mcq_dumps[] = { - {0x0, 256 * 4, "MCQ HCI-0 ", RES_MCQ}, - {0x400, 256 * 4, "MCQ HCI-1 ", RES_MCQ}, - {0x0, 5 * 4, "MCQ VS-0 ", RES_MCQ_VS}, - {0x0, 256 * 4, "MCQ SQD-0 ", RES_MCQ_SQD}, - {0x400, 256 * 4, "MCQ SQD-1 ", RES_MCQ_SQD}, - {0x800, 256 * 4, "MCQ SQD-2 ", RES_MCQ_SQD}, - {0xc00, 256 * 4, "MCQ SQD-3 ", RES_MCQ_SQD}, - {0x1000, 256 * 4, "MCQ SQD-4 ", RES_MCQ_SQD}, - {0x1400, 256 * 4, "MCQ SQD-5 ", RES_MCQ_SQD}, - {0x1800, 256 * 4, "MCQ SQD-6 ", RES_MCQ_SQD}, - {0x1c00, 256 * 4, "MCQ SQD-7 ", RES_MCQ_SQD}, + {hba->mcq_base, 0x0, 256 * 4, "MCQ HCI-0 "}, + {hba->mcq_base, 0x400, 256 * 4, "MCQ HCI-1 "}, + {mcq_vs_base, 0x0, 5 * 4, "MCQ VS-0 "}, + {opr->base, 0x0, 256 * 4, "MCQ SQD-0 "}, + {opr->base, 0x400, 256 * 4, "MCQ SQD-1 "}, + {opr->base, 0x800, 256 * 4, "MCQ SQD-2 "}, + {opr->base, 0xc00, 256 * 4, "MCQ SQD-3 "}, + {opr->base, 0x1000, 256 * 4, "MCQ SQD-4 "}, + {opr->base, 0x1400, 256 * 4, "MCQ SQD-5 "}, + {opr->base, 0x1800, 256 * 4, "MCQ SQD-6 "}, + {opr->base, 0x1c00, 256 * 4, "MCQ SQD-7 "}, + }; for (int i = 0; i < ARRAY_SIZE(mcq_dumps); i++) { ufs_qcom_dump_regs(hba, mcq_dumps[i].offset, mcq_dumps[i].len, - mcq_dumps[i].prefix, mcq_dumps[i].id); + mcq_dumps[i].prefix, mcq_dumps[i].base); cond_resched(); } } @@ -1910,116 +1954,68 @@ static void ufs_qcom_config_scaling_param(struct ufs_hba *hba, hba->clk_scaling.suspend_on_no_request = true; } -/* Resources */ -static const struct ufshcd_res_info ufs_res_info[RES_MAX] = { - {.name = "ufs_mem",}, - {.name = "mcq",}, - /* Submission Queue DAO */ - {.name = "mcq_sqd",}, - /* Submission Queue Interrupt Status */ - {.name = "mcq_sqis",}, - /* Completion Queue DAO */ - {.name = "mcq_cqd",}, - /* Completion Queue Interrupt Status */ - {.name = "mcq_cqis",}, - /* MCQ vendor specific */ - {.name = "mcq_vs",}, -}; - static int ufs_qcom_mcq_config_resource(struct ufs_hba *hba) { struct platform_device *pdev = to_platform_device(hba->dev); - struct ufshcd_res_info *res; - struct resource *res_mem, *res_mcq; - int i, ret; - - memcpy(hba->res, ufs_res_info, sizeof(ufs_res_info)); - - for (i = 0; i < RES_MAX; i++) { - res = &hba->res[i]; - res->resource = platform_get_resource_byname(pdev, - IORESOURCE_MEM, - res->name); - if (!res->resource) { - dev_info(hba->dev, "Resource %s not provided\n", res->name); - if (i == RES_UFS) - return -ENODEV; - continue; - } else if (i == RES_UFS) { - res_mem = res->resource; - res->base = hba->mmio_base; - continue; - } + struct resource *res; - res->base = devm_ioremap_resource(hba->dev, res->resource); - if (IS_ERR(res->base)) { - dev_err(hba->dev, "Failed to map res %s, err=%d\n", - res->name, (int)PTR_ERR(res->base)); - ret = PTR_ERR(res->base); - res->base = NULL; - return ret; - } + /* Map the MCQ configuration region */ + res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "mcq"); + if (!res) { + dev_err(hba->dev, "MCQ resource not found in device tree\n"); + return -ENODEV; } - /* MCQ resource provided in DT */ - res = &hba->res[RES_MCQ]; - /* Bail if MCQ resource is provided */ - if (res->base) - goto out; - - /* Explicitly allocate MCQ resource 
from ufs_mem */ - res_mcq = devm_kzalloc(hba->dev, sizeof(*res_mcq), GFP_KERNEL); - if (!res_mcq) - return -ENOMEM; - - res_mcq->start = res_mem->start + - MCQ_SQATTR_OFFSET(hba->mcq_capabilities); - res_mcq->end = res_mcq->start + hba->nr_hw_queues * MCQ_QCFG_SIZE - 1; - res_mcq->flags = res_mem->flags; - res_mcq->name = "mcq"; - - ret = insert_resource(&iomem_resource, res_mcq); - if (ret) { - dev_err(hba->dev, "Failed to insert MCQ resource, err=%d\n", - ret); - return ret; - } - - res->base = devm_ioremap_resource(hba->dev, res_mcq); - if (IS_ERR(res->base)) { - dev_err(hba->dev, "MCQ registers mapping failed, err=%d\n", - (int)PTR_ERR(res->base)); - ret = PTR_ERR(res->base); - goto ioremap_err; + hba->mcq_base = devm_ioremap_resource(hba->dev, res); + if (IS_ERR(hba->mcq_base)) { + dev_err(hba->dev, "Failed to map MCQ region: %ld\n", + PTR_ERR(hba->mcq_base)); + return PTR_ERR(hba->mcq_base); } -out: - hba->mcq_base = res->base; return 0; -ioremap_err: - res->base = NULL; - remove_resource(res_mcq); - return ret; } static int ufs_qcom_op_runtime_config(struct ufs_hba *hba) { - struct ufshcd_res_info *mem_res, *sqdao_res; struct ufshcd_mcq_opr_info_t *opr; int i; + u32 doorbell_offsets[OPR_MAX]; - mem_res = &hba->res[RES_UFS]; - sqdao_res = &hba->res[RES_MCQ_SQD]; + /* + * Configure doorbell address offsets in MCQ configuration registers. + * These values are offsets relative to mmio_base (UFS_HCI_BASE). + * + * Memory Layout: + * - mmio_base = UFS_HCI_BASE + * - mcq_base = MCQ_CONFIG_BASE = mmio_base + (UFS_QCOM_MCQCAP_QCFGPTR * 0x200) + * - Doorbell registers are at: mmio_base + (UFS_QCOM_MCQCAP_QCFGPTR * 0x200) + + * - UFS_QCOM_MCQ_SQD_OFFSET + * - Which is also: mcq_base + UFS_QCOM_MCQ_SQD_OFFSET + */ - if (!mem_res->base || !sqdao_res->base) - return -EINVAL; + doorbell_offsets[OPR_SQD] = UFS_QCOM_SQD_ADDR_OFFSET; + doorbell_offsets[OPR_SQIS] = UFS_QCOM_SQIS_ADDR_OFFSET; + doorbell_offsets[OPR_CQD] = UFS_QCOM_CQD_ADDR_OFFSET; + doorbell_offsets[OPR_CQIS] = UFS_QCOM_CQIS_ADDR_OFFSET; + /* + * Configure MCQ operation registers. 
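
Plugging in the constants from the ufs-qcom.h hunk below, the offsets being configured here work out to:

	UFS_QCOM_MCQ_CONFIG_OFFSET = 224 * 0x200        = 0x1C000
	OPR_SQD offset (vs mmio)   = 0x1C000 + 0x5000   = 0x21000
	OPR_SQD doorbell base      = mcq_base + (0x21000 - 0x1C000)
	                           = mcq_base + 0x5000

	and queue N's registers start UFS_QCOM_MCQ_STRIDE * N
	(0x100 * N) past each base.
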
+ * + * The doorbell registers are physically located within the MCQ region: + * - doorbell_physical_addr = mmio_base + doorbell_offset + * - doorbell_physical_addr = mcq_base + (doorbell_offset - MCQ_CONFIG_OFFSET) + */ for (i = 0; i < OPR_MAX; i++) { opr = &hba->mcq_opr[i]; - opr->offset = sqdao_res->resource->start - - mem_res->resource->start + 0x40 * i; - opr->stride = 0x100; - opr->base = sqdao_res->base + 0x40 * i; + opr->offset = doorbell_offsets[i]; /* Offset relative to mmio_base */ + opr->stride = UFS_QCOM_MCQ_STRIDE; /* 256 bytes between queues */ + + /* + * Calculate the actual doorbell base address within MCQ region: + * base = mcq_base + (doorbell_offset - MCQ_CONFIG_OFFSET) + */ + opr->base = hba->mcq_base + (opr->offset - UFS_QCOM_MCQ_CONFIG_OFFSET); } return 0; @@ -2034,12 +2030,8 @@ static int ufs_qcom_get_hba_mac(struct ufs_hba *hba) static int ufs_qcom_get_outstanding_cqs(struct ufs_hba *hba, unsigned long *ocqs) { - struct ufshcd_res_info *mcq_vs_res = &hba->res[RES_MCQ_VS]; - - if (!mcq_vs_res->base) - return -EINVAL; - - *ocqs = readl(mcq_vs_res->base + UFS_MEM_CQIS_VS); + /* Read from MCQ vendor-specific register in MCQ region */ + *ocqs = readl(hba->mcq_base + UFS_MEM_CQIS_VS); return 0; } diff --git a/drivers/ufs/host/ufs-qcom.h b/drivers/ufs/host/ufs-qcom.h index e0e129af7c16..380d02333d38 100644 --- a/drivers/ufs/host/ufs-qcom.h +++ b/drivers/ufs/host/ufs-qcom.h @@ -33,6 +33,28 @@ #define DL_VS_CLK_CFG_MASK GENMASK(9, 0) #define DME_VS_CORE_CLK_CTRL_DME_HW_CGC_EN BIT(9) +/* Qualcomm MCQ Configuration */ +#define UFS_QCOM_MCQCAP_QCFGPTR 224 /* 0xE0 in hex */ +#define UFS_QCOM_MCQ_CONFIG_OFFSET (UFS_QCOM_MCQCAP_QCFGPTR * 0x200) /* 0x1C000 */ + +/* Doorbell offsets within MCQ region (relative to MCQ_CONFIG_BASE) */ +#define UFS_QCOM_MCQ_SQD_OFFSET 0x5000 +#define UFS_QCOM_MCQ_CQD_OFFSET 0x5080 +#define UFS_QCOM_MCQ_SQIS_OFFSET 0x5040 +#define UFS_QCOM_MCQ_CQIS_OFFSET 0x50C0 +#define UFS_QCOM_MCQ_STRIDE 0x100 + +/* Calculated doorbell address offsets (relative to mmio_base) */ +#define UFS_QCOM_SQD_ADDR_OFFSET (UFS_QCOM_MCQ_CONFIG_OFFSET + UFS_QCOM_MCQ_SQD_OFFSET) +#define UFS_QCOM_CQD_ADDR_OFFSET (UFS_QCOM_MCQ_CONFIG_OFFSET + UFS_QCOM_MCQ_CQD_OFFSET) +#define UFS_QCOM_SQIS_ADDR_OFFSET (UFS_QCOM_MCQ_CONFIG_OFFSET + UFS_QCOM_MCQ_SQIS_OFFSET) +#define UFS_QCOM_CQIS_ADDR_OFFSET (UFS_QCOM_MCQ_CONFIG_OFFSET + UFS_QCOM_MCQ_CQIS_OFFSET) +#define REG_UFS_MCQ_STRIDE UFS_QCOM_MCQ_STRIDE + +/* MCQ Vendor specific address offsets (relative to MCQ_CONFIG_BASE) */ +#define UFS_MEM_VS_BASE 0x4000 +#define UFS_MEM_CQIS_VS 0x4008 + /* QCOM UFS host controller vendor specific registers */ enum { REG_UFS_SYS1CLK_1US = 0xC0, @@ -60,7 +82,7 @@ enum { UFS_AH8_CFG = 0xFC, UFS_RD_REG_MCQ = 0xD00, - + UFS_MEM_ICE_CFG = 0x2600, REG_UFS_MEM_ICE_CONFIG = 0x260C, REG_UFS_MEM_ICE_NUM_CORE = 0x2664, @@ -95,10 +117,6 @@ enum { REG_UFS_SW_H8_EXIT_CNT = 0x2710, }; -enum { - UFS_MEM_CQIS_VS = 0x8, -}; - #define UFS_CNTLR_2_x_x_VEN_REGS_OFFSET(x) (0x000 + x) #define UFS_CNTLR_3_x_x_VEN_REGS_OFFSET(x) (0x400 + x) diff --git a/drivers/ufs/host/ufshcd-pltfrm.c b/drivers/ufs/host/ufshcd-pltfrm.c index ffe5d1d2b215..c2dafb583cf5 100644 --- a/drivers/ufs/host/ufshcd-pltfrm.c +++ b/drivers/ufs/host/ufshcd-pltfrm.c @@ -430,6 +430,39 @@ int ufshcd_negotiate_pwr_params(const struct ufs_host_params *host_params, } EXPORT_SYMBOL_GPL(ufshcd_negotiate_pwr_params); +/** + * ufshcd_parse_gear_limits - Parse DT-based gear and rate limits for UFS + * @hba: Pointer to UFS host bus adapter instance + * @host_params: 
Pointer to UFS host parameters structure to be updated + * + * This function reads optional device tree properties to apply + * platform-specific constraints. + * + * "limit-hs-gear": Specifies the max HS gear. + * "limit-gear-rate": Specifies the max High-Speed rate. + */ +void ufshcd_parse_gear_limits(struct ufs_hba *hba, struct ufs_host_params *host_params) +{ + struct device_node *np = hba->dev->of_node; + u32 hs_gear; + const char *hs_rate; + + if (!of_property_read_u32(np, "limit-hs-gear", &hs_gear)) { + host_params->hs_tx_gear = hs_gear; + host_params->hs_rx_gear = hs_gear; + } + + if (!of_property_read_string(np, "limit-gear-rate", &hs_rate)) { + if (!strcmp(hs_rate, "rate-a")) + host_params->hs_rate = PA_HS_MODE_A; + else if (!strcmp(hs_rate, "rate-b")) + host_params->hs_rate = PA_HS_MODE_B; + else + dev_warn(hba->dev, "Invalid rate: %s\n", hs_rate); + } +} +EXPORT_SYMBOL_GPL(ufshcd_parse_gear_limits); + void ufshcd_init_host_params(struct ufs_host_params *host_params) { *host_params = (struct ufs_host_params){ diff --git a/drivers/ufs/host/ufshcd-pltfrm.h b/drivers/ufs/host/ufshcd-pltfrm.h index 3017f8e8f93c..0a18a8aed94d 100644 --- a/drivers/ufs/host/ufshcd-pltfrm.h +++ b/drivers/ufs/host/ufshcd-pltfrm.h @@ -29,6 +29,7 @@ int ufshcd_negotiate_pwr_params(const struct ufs_host_params *host_params, const struct ufs_pa_layer_attr *dev_max, struct ufs_pa_layer_attr *agreed_pwr); void ufshcd_init_host_params(struct ufs_host_params *host_params); +void ufshcd_parse_gear_limits(struct ufs_hba *hba, struct ufs_host_params *host_params); int ufshcd_pltfrm_init(struct platform_device *pdev, const struct ufs_hba_variant_ops *vops); void ufshcd_pltfrm_remove(struct platform_device *pdev); diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index f5062061c408..c147145a6593 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -378,7 +378,7 @@ static int vring_map_one_sg(const struct vring_virtqueue *vq, struct scatterlist * is initialized by the hardware. Explicitly check/unpoison it * depending on the direction. */ - kmsan_handle_dma(sg_page(sg), sg->offset, sg->length, direction); + kmsan_handle_dma(sg_phys(sg), sg->length, direction); *addr = (dma_addr_t)sg_phys(sg); return 0; } @@ -3157,7 +3157,7 @@ dma_addr_t virtqueue_dma_map_single_attrs(struct virtqueue *_vq, void *ptr, struct vring_virtqueue *vq = to_vvq(_vq); if (!vq->use_dma_api) { - kmsan_handle_dma(virt_to_page(ptr), offset_in_page(ptr), size, dir); + kmsan_handle_dma(virt_to_phys(ptr), size, dir); return (dma_addr_t)virt_to_phys(ptr); } diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index da1a7d3d377c..dd7747a2de87 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -392,6 +392,25 @@ xen_swiotlb_sync_sg_for_device(struct device *dev, struct scatterlist *sgl, } } +static dma_addr_t xen_swiotlb_direct_map_resource(struct device *dev, + phys_addr_t paddr, + size_t size, + enum dma_data_direction dir, + unsigned long attrs) +{ + dma_addr_t dma_addr = paddr; + + if (unlikely(!dma_capable(dev, dma_addr, size, false))) { + dev_err_once(dev, + "DMA addr %pad+%zu overflow (mask %llx, bus limit %llx).\n", + &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit); + WARN_ON_ONCE(1); + return DMA_MAPPING_ERROR; + } + + return dma_addr; +} + /* * Return whether the given device DMA address mask can be supported * properly. 
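
An example device-tree fragment for the two optional properties consumed by ufshcd_parse_gear_limits() above; the node label and values are illustrative only:

	/*
	 *	&ufshc {
	 *		limit-hs-gear = <4>;		// cap HS gear at G4
	 *		limit-gear-rate = "rate-a";	// prefer HS Rate-A
	 *	};
	 */
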
For example, if your device can only drive the low 24-bits @@ -426,5 +445,5 @@ const struct dma_map_ops xen_swiotlb_dma_ops = { .alloc_pages_op = dma_common_alloc_pages, .free_pages = dma_common_free_pages, .max_mapping_size = swiotlb_max_mapping_size, - .map_resource = dma_direct_map_resource, + .map_resource = xen_swiotlb_direct_map_resource, }; diff --git a/fs/bpf_fs_kfuncs.c b/fs/bpf_fs_kfuncs.c index 1e36a12b88f7..5ace2511fec5 100644 --- a/fs/bpf_fs_kfuncs.c +++ b/fs/bpf_fs_kfuncs.c @@ -79,7 +79,7 @@ __bpf_kfunc void bpf_put_file(struct file *file) * pathname in *buf*, including the NUL termination character. On error, a * negative integer is returned. */ -__bpf_kfunc int bpf_path_d_path(struct path *path, char *buf, size_t buf__sz) +__bpf_kfunc int bpf_path_d_path(const struct path *path, char *buf, size_t buf__sz) { int len; char *ret; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 6aad6b65522b..621e0df097e3 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -5829,7 +5829,7 @@ struct btrfs_dir_list { * See process_dir_items_leaf() for details about why it is needed. * This is a recursive operation - if an existing dentry corresponds to a * directory, that directory's new entries are logged too (same behaviour as - * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes + * ext3/4, xfs, f2fs, nilfs2). Note that when logging the inodes * the dentries point to we do not acquire their VFS lock, otherwise lockdep * complains about the following circular lock dependency / possible deadlock: * diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c index 69133ec1fac2..f3f79c67add5 100644 --- a/fs/configfs/symlink.c +++ b/fs/configfs/symlink.c @@ -114,26 +114,21 @@ static int create_link(struct config_item *parent_item, } -static int get_target(const char *symname, struct path *path, - struct config_item **target, struct super_block *sb) +static int get_target(const char *symname, struct config_item **target, + struct super_block *sb) { + struct path path __free(path_put) = {}; int ret; - ret = kern_path(symname, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, path); - if (!ret) { - if (path->dentry->d_sb == sb) { - *target = configfs_get_config_item(path->dentry); - if (!*target) { - ret = -ENOENT; - path_put(path); - } - } else { - ret = -EPERM; - path_put(path); - } - } - - return ret; + ret = kern_path(symname, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &path); + if (ret) + return ret; + if (path.dentry->d_sb != sb) + return -EPERM; + *target = configfs_get_config_item(path.dentry); + if (!*target) + return -ENOENT; + return 0; } @@ -141,7 +136,6 @@ int configfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { int ret; - struct path path; struct configfs_dirent *sd; struct config_item *parent_item; struct config_item *target_item = NULL; @@ -188,7 +182,7 @@ int configfs_symlink(struct mnt_idmap *idmap, struct inode *dir, * AV, a thoroughly annoyed bastard. 
*/ inode_unlock(dir); - ret = get_target(symname, &path, &target_item, dentry->d_sb); + ret = get_target(symname, &target_item, dentry->d_sb); inode_lock(dir); if (ret) goto out_put; @@ -210,7 +204,6 @@ int configfs_symlink(struct mnt_idmap *idmap, struct inode *dir, } config_item_put(target_item); - path_put(&path); out_put: config_item_put(parent_item); diff --git a/fs/file_table.c b/fs/file_table.c index 81c72576e548..b223d873e48b 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -54,7 +54,7 @@ struct backing_file { #define backing_file(f) container_of(f, struct backing_file, file) -struct path *backing_file_user_path(const struct file *f) +const struct path *backing_file_user_path(const struct file *f) { return &backing_file(f)->user_path; } @@ -171,7 +171,7 @@ static int init_file(struct file *f, int flags, const struct cred *cred) * the respective member when opening the file. */ mutex_init(&f->f_pos_lock); - memset(&f->f_path, 0, sizeof(f->f_path)); + memset(&f->__f_path, 0, sizeof(f->f_path)); memset(&f->f_ra, 0, sizeof(f->f_ra)); f->f_flags = flags; @@ -319,7 +319,7 @@ struct file *alloc_empty_backing_file(int flags, const struct cred *cred) static void file_init_path(struct file *file, const struct path *path, const struct file_operations *fop) { - file->f_path = *path; + file->__f_path = *path; file->f_inode = path->dentry->d_inode; file->f_mapping = path->dentry->d_inode->i_mapping; file->f_wb_err = filemap_sample_wb_err(file->f_mapping); diff --git a/fs/internal.h b/fs/internal.h index b5c62abefff4..9b2b4d116880 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -53,7 +53,7 @@ extern int finish_clean_context(struct fs_context *fc); * namei.c */ extern int filename_lookup(int dfd, struct filename *name, unsigned flags, - struct path *path, struct path *root); + struct path *path, const struct path *root); int do_rmdir(int dfd, struct filename *name); int do_unlinkat(int dfd, struct filename *name); int may_linkat(struct mnt_idmap *idmap, const struct path *link); diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index e80262a51884..d68afa196535 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -216,8 +216,7 @@ out_err: if (warned++ == 0) printk(KERN_WARNING "lockd_up: makesock failed, error=%d\n", err); - svc_xprt_destroy_all(serv, net); - svc_rpcb_cleanup(serv, net); + svc_xprt_destroy_all(serv, net, true); return err; } @@ -255,8 +254,7 @@ static void lockd_down_net(struct svc_serv *serv, struct net *net) nlm_shutdown_hosts_net(net); cancel_delayed_work_sync(&ln->grace_period_end); locks_end_grace(&ln->lockd_manager); - svc_xprt_destroy_all(serv, net); - svc_rpcb_cleanup(serv, net); + svc_xprt_destroy_all(serv, net, true); } } else { pr_err("%s: no users! 
net=%x\n", diff --git a/fs/namei.c b/fs/namei.c index 507ca0d7878d..7377020a2cba 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2695,7 +2695,7 @@ static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path } int filename_lookup(int dfd, struct filename *name, unsigned flags, - struct path *path, struct path *root) + struct path *path, const struct path *root) { int retval; struct nameidata nd; @@ -3651,8 +3651,8 @@ static struct dentry *atomic_open(struct nameidata *nd, struct dentry *dentry, if (nd->flags & LOOKUP_DIRECTORY) open_flag |= O_DIRECTORY; - file->f_path.dentry = DENTRY_NOT_SET; - file->f_path.mnt = nd->path.mnt; + file->__f_path.dentry = DENTRY_NOT_SET; + file->__f_path.mnt = nd->path.mnt; error = dir->i_op->atomic_open(dir, dentry, file, open_to_namei_flags(open_flag), mode); d_lookup_done(dentry); @@ -4020,8 +4020,8 @@ int vfs_tmpfile(struct mnt_idmap *idmap, child = d_alloc(parentpath->dentry, &slash_name); if (unlikely(!child)) return -ENOMEM; - file->f_path.mnt = parentpath->mnt; - file->f_path.dentry = child; + file->__f_path.mnt = parentpath->mnt; + file->__f_path.dentry = child; mode = vfs_prepare_mode(idmap, dir, mode, mode, mode); error = dir->i_op->tmpfile(idmap, dir, file, mode); dput(child); @@ -4256,7 +4256,7 @@ struct dentry *start_creating_path(int dfd, const char *pathname, } EXPORT_SYMBOL(start_creating_path); -void end_creating_path(struct path *path, struct dentry *dentry) +void end_creating_path(const struct path *path, struct dentry *dentry) { if (!IS_ERR(dentry)) dput(dentry); diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 5d6edafbed20..0e4c67373e4f 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -676,7 +676,7 @@ bl_alloc_lseg(struct pnfs_layout_hdr *lo, struct nfs4_layoutget_res *lgr, struct pnfs_layout_segment *lseg; struct xdr_buf buf; struct xdr_stream xdr; - struct page *scratch; + struct folio *scratch; int status, i; uint32_t count; __be32 *p; @@ -689,13 +689,13 @@ bl_alloc_lseg(struct pnfs_layout_hdr *lo, struct nfs4_layoutget_res *lgr, return ERR_PTR(-ENOMEM); status = -ENOMEM; - scratch = alloc_page(gfp_mask); + scratch = folio_alloc(gfp_mask, 0); if (!scratch) goto out; xdr_init_decode_pages(&xdr, &buf, lgr->layoutp->pages, lgr->layoutp->len); - xdr_set_scratch_page(&xdr, scratch); + xdr_set_scratch_folio(&xdr, scratch); status = -EIO; p = xdr_inline_decode(&xdr, 4); @@ -744,7 +744,7 @@ process_extents: } out_free_scratch: - __free_page(scratch); + folio_put(scratch); out: dprintk("%s returns %d\n", __func__, status); switch (status) { diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c index 44306ac22353..ab76120705e2 100644 --- a/fs/nfs/blocklayout/dev.c +++ b/fs/nfs/blocklayout/dev.c @@ -541,16 +541,16 @@ bl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, struct pnfs_block_dev *top; struct xdr_stream xdr; struct xdr_buf buf; - struct page *scratch; + struct folio *scratch; int nr_volumes, ret, i; __be32 *p; - scratch = alloc_page(gfp_mask); + scratch = folio_alloc(gfp_mask, 0); if (!scratch) goto out; xdr_init_decode_pages(&xdr, &buf, pdev->pages, pdev->pglen); - xdr_set_scratch_page(&xdr, scratch); + xdr_set_scratch_folio(&xdr, scratch); p = xdr_inline_decode(&xdr, sizeof(__be32)); if (!p) @@ -582,7 +582,7 @@ bl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, out_free_volumes: kfree(volumes); out_free_scratch: - __free_page(scratch); + folio_put(scratch); out: return node; } diff 
--git a/fs/nfs/callback.c b/fs/nfs/callback.c index 86bdc7d23fb9..c8b837006bb2 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -136,7 +136,7 @@ static void nfs_callback_down_net(u32 minorversion, struct svc_serv *serv, struc return; dprintk("NFS: destroy per-net callback data; net=%x\n", net->ns.inum); - svc_xprt_destroy_all(serv, net); + svc_xprt_destroy_all(serv, net, false); } static int nfs_callback_up_net(int minorversion, struct svc_serv *serv, @@ -153,7 +153,7 @@ static int nfs_callback_up_net(int minorversion, struct svc_serv *serv, ret = svc_bind(serv, net); if (ret < 0) { printk(KERN_WARNING "NFS: bind callback service failed\n"); - goto err_bind; + goto err; } ret = 0; @@ -166,13 +166,11 @@ static int nfs_callback_up_net(int minorversion, struct svc_serv *serv, if (ret < 0) { printk(KERN_ERR "NFS: callback service start failed\n"); - goto err_socks; + goto err; } return 0; -err_socks: - svc_rpcb_cleanup(serv, net); -err_bind: +err: nn->cb_users[minorversion]--; dprintk("NFS: Couldn't create callback socket: err = %d; " "net = %x\n", ret, net->ns.inum); diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 5f7d9be6f022..46d9c65d50f8 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -829,17 +829,17 @@ static int nfs_readdir_folio_filler(struct nfs_readdir_descriptor *desc, struct address_space *mapping = desc->file->f_mapping; struct folio *new, *folio = *arrays; struct xdr_stream stream; - struct page *scratch; + struct folio *scratch; struct xdr_buf buf; u64 cookie; int status; - scratch = alloc_page(GFP_KERNEL); + scratch = folio_alloc(GFP_KERNEL, 0); if (scratch == NULL) return -ENOMEM; xdr_init_decode_pages(&stream, &buf, xdr_pages, buflen); - xdr_set_scratch_page(&stream, scratch); + xdr_set_scratch_folio(&stream, scratch); do { status = nfs_readdir_entry_decode(desc, entry, &stream); @@ -891,7 +891,7 @@ static int nfs_readdir_folio_filler(struct nfs_readdir_descriptor *desc, if (folio != *arrays) nfs_readdir_folio_unlock_and_put(folio); - put_page(scratch); + folio_put(scratch); return status; } diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 8059ece82468..d020aab40c64 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -161,6 +161,8 @@ nfs_file_read(struct kiocb *iocb, struct iov_iter *to) struct inode *inode = file_inode(iocb->ki_filp); ssize_t result; + trace_nfs_file_read(iocb, to); + if (iocb->ki_flags & IOCB_DIRECT) return nfs_file_direct_read(iocb, to, false); @@ -361,6 +363,8 @@ static bool nfs_want_read_modify_write(struct file *file, struct folio *folio, if (pnfs_ld_read_whole_page(file_inode(file))) return true; + if (folio_test_dropbehind(folio)) + return false; /* Open for reading too? 
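
The alloc_page()/xdr_set_scratch_page() to folio_alloc()/xdr_set_scratch_folio() conversions in these NFS hunks all follow one shape: the scratch buffer backs xdr_inline_decode() items that straddle a page boundary. A condensed sketch (demo_decode is hypothetical; xdr_set_scratch_folio is the helper this series switches to):

	static int demo_decode(struct xdr_stream *xdr, struct xdr_buf *buf,
			       struct page **pages, unsigned int buflen)
	{
		struct folio *scratch = folio_alloc(GFP_KERNEL, 0); /* order 0: one page */

		if (!scratch)
			return -ENOMEM;
		xdr_init_decode_pages(xdr, buf, pages, buflen);
		xdr_set_scratch_folio(xdr, scratch);
		/* ... xdr_inline_decode() calls ... */
		folio_put(scratch);	/* release the scratch folio */
		return 0;
	}
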
*/ if (file->f_mode & FMODE_READ) return true; @@ -380,22 +384,23 @@ static int nfs_write_begin(const struct kiocb *iocb, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { - fgf_t fgp = FGP_WRITEBEGIN; struct folio *folio; struct file *file = iocb->ki_filp; int once_thru = 0; int ret; + trace_nfs_write_begin(file_inode(file), pos, len); + dfprintk(PAGECACHE, "NFS: write_begin(%pD2(%lu), %u@%lld)\n", file, mapping->host->i_ino, len, (long long) pos); nfs_truncate_last_folio(mapping, i_size_read(mapping->host), pos); - fgp |= fgf_set_order(len); start: - folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, fgp, - mapping_gfp_mask(mapping)); - if (IS_ERR(folio)) - return PTR_ERR(folio); + folio = write_begin_get_folio(iocb, mapping, pos >> PAGE_SHIFT, len); + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); + goto out; + } *foliop = folio; ret = nfs_flush_incompatible(file, folio); @@ -405,11 +410,14 @@ start: } else if (!once_thru && nfs_want_read_modify_write(file, folio, pos, len)) { once_thru = 1; + folio_clear_dropbehind(folio); ret = nfs_read_folio(file, folio); folio_put(folio); if (!ret) goto start; } +out: + trace_nfs_write_begin_done(file_inode(file), pos, len, ret); return ret; } @@ -423,6 +431,7 @@ static int nfs_write_end(const struct kiocb *iocb, unsigned offset = offset_in_folio(folio, pos); int status; + trace_nfs_write_end(file_inode(file), pos, len); dfprintk(PAGECACHE, "NFS: write_end(%pD2(%lu), %u@%lld)\n", file, mapping->host->i_ino, len, (long long) pos); @@ -451,13 +460,16 @@ static int nfs_write_end(const struct kiocb *iocb, folio_unlock(folio); folio_put(folio); - if (status < 0) + if (status < 0) { + trace_nfs_write_end_done(file_inode(file), pos, len, status); return status; + } NFS_I(mapping->host)->write_io += copied; if (nfs_ctx_key_to_expire(ctx, mapping->host)) nfs_wb_all(mapping->host); + trace_nfs_write_end_done(file_inode(file), pos, len, copied); return copied; } @@ -690,6 +702,8 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) errseq_t since; int error; + trace_nfs_file_write(iocb, from); + result = nfs_key_timeout_notify(file, inode); if (result) return result; @@ -949,5 +963,6 @@ const struct file_operations nfs_file_operations = { .splice_write = iter_file_splice_write, .check_flags = nfs_check_flags, .setlease = simple_nosetlease, + .fop_flags = FOP_DONTCACHE, }; EXPORT_SYMBOL_GPL(nfs_file_operations); diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index d39a1f58e18d..5c4551117c58 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -646,19 +646,19 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo, { struct xdr_stream stream; struct xdr_buf buf; - struct page *scratch; + struct folio *scratch; __be32 *p; uint32_t nfl_util; int i; dprintk("%s: set_layout_map Begin\n", __func__); - scratch = alloc_page(gfp_flags); + scratch = folio_alloc(gfp_flags, 0); if (!scratch) return -ENOMEM; xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len); - xdr_set_scratch_page(&stream, scratch); + xdr_set_scratch_folio(&stream, scratch); /* 20 = ufl_util (4), first_stripe_index (4), pattern_offset (8), * num_fh (4) */ @@ -724,11 +724,11 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo, fl->fh_array[i]->size); } - __free_page(scratch); + folio_put(scratch); return 0; out_err: - __free_page(scratch); + folio_put(scratch); return -EIO; } diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c index 
29d9234d5c08..df79aeb68db4 100644 --- a/fs/nfs/filelayout/filelayoutdev.c +++ b/fs/nfs/filelayout/filelayoutdev.c @@ -73,18 +73,18 @@ nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, struct nfs4_file_layout_dsaddr *dsaddr = NULL; struct xdr_stream stream; struct xdr_buf buf; - struct page *scratch; + struct folio *scratch; struct list_head dsaddrs; struct nfs4_pnfs_ds_addr *da; struct net *net = server->nfs_client->cl_net; /* set up xdr stream */ - scratch = alloc_page(gfp_flags); + scratch = folio_alloc(gfp_flags, 0); if (!scratch) goto out_err; xdr_init_decode_pages(&stream, &buf, pdev->pages, pdev->pglen); - xdr_set_scratch_page(&stream, scratch); + xdr_set_scratch_folio(&stream, scratch); /* Get the stripe count (number of stripe index) */ p = xdr_inline_decode(&stream, 4); @@ -186,7 +186,7 @@ nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, } } - __free_page(scratch); + folio_put(scratch); return dsaddr; out_err_drain_dsaddrs: @@ -204,7 +204,7 @@ out_err_free_deviceid: out_err_free_stripe_indices: kfree(stripe_indices); out_err_free_scratch: - __free_page(scratch); + folio_put(scratch); out_err: dprintk("%s ERROR: returning NULL\n", __func__); return NULL; diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 9edb5f9b0c4e..df01d2876b68 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -47,7 +47,7 @@ ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo, int dev_limit, enum nfs4_ff_op_type type); static void ff_layout_encode_ff_layoutupdate(struct xdr_stream *xdr, const struct nfs42_layoutstat_devinfo *devinfo, - struct nfs4_ff_layout_mirror *mirror); + struct nfs4_ff_layout_ds_stripe *dss_info); static struct pnfs_layout_hdr * ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags) @@ -164,31 +164,32 @@ decode_name(struct xdr_stream *xdr, u32 *id) } static struct nfsd_file * -ff_local_open_fh(struct pnfs_layout_segment *lseg, u32 ds_idx, +ff_local_open_fh(struct pnfs_layout_segment *lseg, u32 ds_idx, u32 dss_id, struct nfs_client *clp, const struct cred *cred, struct nfs_fh *fh, fmode_t mode) { #if IS_ENABLED(CONFIG_NFS_LOCALIO) struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx); - return nfs_local_open_fh(clp, cred, fh, &mirror->nfl, mode); + return nfs_local_open_fh(clp, cred, fh, &mirror->dss[dss_id].nfl, mode); #else return NULL; #endif } -static bool ff_mirror_match_fh(const struct nfs4_ff_layout_mirror *m1, - const struct nfs4_ff_layout_mirror *m2) +static bool ff_dss_match_fh(const struct nfs4_ff_layout_ds_stripe *dss1, + const struct nfs4_ff_layout_ds_stripe *dss2) { int i, j; - if (m1->fh_versions_cnt != m2->fh_versions_cnt) + if (dss1->fh_versions_cnt != dss2->fh_versions_cnt) return false; - for (i = 0; i < m1->fh_versions_cnt; i++) { + + for (i = 0; i < dss1->fh_versions_cnt; i++) { bool found_fh = false; - for (j = 0; j < m2->fh_versions_cnt; j++) { - if (nfs_compare_fh(&m1->fh_versions[i], - &m2->fh_versions[j]) == 0) { + for (j = 0; j < dss2->fh_versions_cnt; j++) { + if (nfs_compare_fh(&dss1->fh_versions[i], + &dss2->fh_versions[j]) == 0) { found_fh = true; break; } @@ -199,6 +200,38 @@ static bool ff_mirror_match_fh(const struct nfs4_ff_layout_mirror *m1, return true; } +static bool ff_mirror_match_fh(const struct nfs4_ff_layout_mirror *m1, + const struct nfs4_ff_layout_mirror *m2) +{ + u32 dss_id; + + if (m1->dss_count != m2->dss_count) + return false; + + for (dss_id = 0; dss_id < 
m1->dss_count; dss_id++) + if (!ff_dss_match_fh(&m1->dss[dss_id], &m2->dss[dss_id])) + return false; + + return true; +} + +static bool ff_mirror_match_devid(const struct nfs4_ff_layout_mirror *m1, + const struct nfs4_ff_layout_mirror *m2) +{ + u32 dss_id; + + if (m1->dss_count != m2->dss_count) + return false; + + for (dss_id = 0; dss_id < m1->dss_count; dss_id++) + if (memcmp(&m1->dss[dss_id].devid, + &m2->dss[dss_id].devid, + sizeof(m1->dss[dss_id].devid)) != 0) + return false; + + return true; +} + static struct nfs4_ff_layout_mirror * ff_layout_add_mirror(struct pnfs_layout_hdr *lo, struct nfs4_ff_layout_mirror *mirror) @@ -209,7 +242,7 @@ ff_layout_add_mirror(struct pnfs_layout_hdr *lo, spin_lock(&inode->i_lock); list_for_each_entry(pos, &ff_layout->mirrors, mirrors) { - if (memcmp(&mirror->devid, &pos->devid, sizeof(pos->devid)) != 0) + if (!ff_mirror_match_devid(mirror, pos)) continue; if (!ff_mirror_match_fh(mirror, pos)) continue; @@ -240,29 +273,37 @@ ff_layout_remove_mirror(struct nfs4_ff_layout_mirror *mirror) static struct nfs4_ff_layout_mirror *ff_layout_alloc_mirror(gfp_t gfp_flags) { struct nfs4_ff_layout_mirror *mirror; + u32 dss_id; mirror = kzalloc(sizeof(*mirror), gfp_flags); if (mirror != NULL) { spin_lock_init(&mirror->lock); refcount_set(&mirror->ref, 1); INIT_LIST_HEAD(&mirror->mirrors); - nfs_localio_file_init(&mirror->nfl); + for (dss_id = 0; dss_id < mirror->dss_count; dss_id++) + nfs_localio_file_init(&mirror->dss[dss_id].nfl); } return mirror; } static void ff_layout_free_mirror(struct nfs4_ff_layout_mirror *mirror) { - const struct cred *cred; + const struct cred *cred; + u32 dss_id; ff_layout_remove_mirror(mirror); - kfree(mirror->fh_versions); - nfs_close_local_fh(&mirror->nfl); - cred = rcu_access_pointer(mirror->ro_cred); - put_cred(cred); - cred = rcu_access_pointer(mirror->rw_cred); - put_cred(cred); - nfs4_ff_layout_put_deviceid(mirror->mirror_ds); + + for (dss_id = 0; dss_id < mirror->dss_count; dss_id++) { + kfree(mirror->dss[dss_id].fh_versions); + cred = rcu_access_pointer(mirror->dss[dss_id].ro_cred); + put_cred(cred); + cred = rcu_access_pointer(mirror->dss[dss_id].rw_cred); + put_cred(cred); + nfs_close_local_fh(&mirror->dss[dss_id].nfl); + nfs4_ff_layout_put_deviceid(mirror->dss[dss_id].mirror_ds); + } + + kfree(mirror->dss); kfree(mirror); } @@ -366,14 +407,24 @@ ff_layout_add_lseg(struct pnfs_layout_hdr *lo, free_me); } +static u32 ff_mirror_efficiency_sum(const struct nfs4_ff_layout_mirror *mirror) +{ + u32 dss_id, sum = 0; + + for (dss_id = 0; dss_id < mirror->dss_count; dss_id++) + sum += mirror->dss[dss_id].efficiency; + + return sum; +} + static void ff_layout_sort_mirrors(struct nfs4_ff_layout_segment *fls) { int i, j; for (i = 0; i < fls->mirror_array_cnt - 1; i++) { for (j = i + 1; j < fls->mirror_array_cnt; j++) - if (fls->mirror_array[i]->efficiency < - fls->mirror_array[j]->efficiency) + if (ff_mirror_efficiency_sum(fls->mirror_array[i]) < + ff_mirror_efficiency_sum(fls->mirror_array[j])) swap(fls->mirror_array[i], fls->mirror_array[j]); } @@ -388,20 +439,21 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, struct nfs4_ff_layout_segment *fls = NULL; struct xdr_stream stream; struct xdr_buf buf; - struct page *scratch; + struct folio *scratch; u64 stripe_unit; u32 mirror_array_cnt; __be32 *p; int i, rc; + struct nfs4_ff_layout_ds_stripe *dss_info; dprintk("--> %s\n", __func__); - scratch = alloc_page(gfp_flags); + scratch = folio_alloc(gfp_flags, 0); if (!scratch) return ERR_PTR(-ENOMEM); xdr_init_decode_pages(&stream, &buf, 
lgr->layoutp->pages, lgr->layoutp->len); - xdr_set_scratch_page(&stream, scratch); + xdr_set_scratch_folio(&stream, scratch); /* stripe unit and mirror_array_cnt */ rc = -EIO; @@ -427,23 +479,32 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, fls->mirror_array_cnt = mirror_array_cnt; fls->stripe_unit = stripe_unit; + u32 dss_count = 0; for (i = 0; i < fls->mirror_array_cnt; i++) { struct nfs4_ff_layout_mirror *mirror; struct cred *kcred; const struct cred __rcu *cred; kuid_t uid; kgid_t gid; - u32 ds_count, fh_count, id; - int j; + u32 fh_count, id; + int j, dss_id; rc = -EIO; p = xdr_inline_decode(&stream, 4); if (!p) goto out_err_free; - ds_count = be32_to_cpup(p); - /* FIXME: allow for striping? */ - if (ds_count != 1) + // Ensure all mirrors have same stripe count. + if (dss_count == 0) + dss_count = be32_to_cpup(p); + else if (dss_count != be32_to_cpup(p)) + goto out_err_free; + + if (dss_count > NFS4_FLEXFILE_LAYOUT_MAX_STRIPE_CNT || + dss_count == 0) + goto out_err_free; + + if (dss_count > 1 && stripe_unit == 0) goto out_err_free; fls->mirror_array[i] = ff_layout_alloc_mirror(gfp_flags); @@ -452,91 +513,105 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, goto out_err_free; } - fls->mirror_array[i]->ds_count = ds_count; + fls->mirror_array[i]->dss_count = dss_count; + fls->mirror_array[i]->dss = + kcalloc(dss_count, sizeof(struct nfs4_ff_layout_ds_stripe), + gfp_flags); - /* deviceid */ - rc = decode_deviceid(&stream, &fls->mirror_array[i]->devid); - if (rc) - goto out_err_free; + for (dss_id = 0; dss_id < dss_count; dss_id++) { + dss_info = &fls->mirror_array[i]->dss[dss_id]; + dss_info->mirror = fls->mirror_array[i]; - /* efficiency */ - rc = -EIO; - p = xdr_inline_decode(&stream, 4); - if (!p) - goto out_err_free; - fls->mirror_array[i]->efficiency = be32_to_cpup(p); + /* deviceid */ + rc = decode_deviceid(&stream, &dss_info->devid); + if (rc) + goto out_err_free; - /* stateid */ - rc = decode_pnfs_stateid(&stream, &fls->mirror_array[i]->stateid); - if (rc) - goto out_err_free; + /* efficiency */ + rc = -EIO; + p = xdr_inline_decode(&stream, 4); + if (!p) + goto out_err_free; + dss_info->efficiency = be32_to_cpup(p); - /* fh */ - rc = -EIO; - p = xdr_inline_decode(&stream, 4); - if (!p) - goto out_err_free; - fh_count = be32_to_cpup(p); + /* stateid */ + rc = decode_pnfs_stateid(&stream, &dss_info->stateid); + if (rc) + goto out_err_free; - fls->mirror_array[i]->fh_versions = - kcalloc(fh_count, sizeof(struct nfs_fh), - gfp_flags); - if (fls->mirror_array[i]->fh_versions == NULL) { - rc = -ENOMEM; - goto out_err_free; - } + /* fh */ + rc = -EIO; + p = xdr_inline_decode(&stream, 4); + if (!p) + goto out_err_free; + fh_count = be32_to_cpup(p); - for (j = 0; j < fh_count; j++) { - rc = decode_nfs_fh(&stream, - &fls->mirror_array[i]->fh_versions[j]); + dss_info->fh_versions = + kcalloc(fh_count, sizeof(struct nfs_fh), + gfp_flags); + if (dss_info->fh_versions == NULL) { + rc = -ENOMEM; + goto out_err_free; + } + + for (j = 0; j < fh_count; j++) { + rc = decode_nfs_fh(&stream, + &dss_info->fh_versions[j]); + if (rc) + goto out_err_free; + } + + dss_info->fh_versions_cnt = fh_count; + + /* user */ + rc = decode_name(&stream, &id); if (rc) goto out_err_free; - } - fls->mirror_array[i]->fh_versions_cnt = fh_count; + uid = make_kuid(&init_user_ns, id); - /* user */ - rc = decode_name(&stream, &id); - if (rc) - goto out_err_free; + /* group */ + rc = decode_name(&stream, &id); + if (rc) + goto out_err_free; - uid = make_kuid(&init_user_ns, id); + gid = make_kgid(&init_user_ns, id); 
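
The churn in this flexfiles hunk is driven by one data-structure change: fields that used to live directly in nfs4_ff_layout_mirror (devid, stateid, fh_versions, creds, stats, ...) move into a per-stripe array so a single mirror can describe several data-server stripes. Roughly, with demo_* names standing in for the real declarations:

	struct demo_ds_stripe {		/* one data-server stripe */
		struct nfs4_deviceid	devid;
		nfs4_stateid		stateid;
		struct nfs_fh		*fh_versions;
		u32			fh_versions_cnt;
		/* ro/rw creds, localio handle, per-stripe stats, ... */
	};

	struct demo_mirror {
		u32			dss_count;	/* stripes per mirror */
		struct demo_ds_stripe	*dss;		/* kcalloc'd array */
	};
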
- /* group */ - rc = decode_name(&stream, &id); - if (rc) - goto out_err_free; + if (gfp_flags & __GFP_FS) + kcred = prepare_kernel_cred(&init_task); + else { + unsigned int nofs_flags = memalloc_nofs_save(); - gid = make_kgid(&init_user_ns, id); + kcred = prepare_kernel_cred(&init_task); + memalloc_nofs_restore(nofs_flags); + } + rc = -ENOMEM; + if (!kcred) + goto out_err_free; + kcred->fsuid = uid; + kcred->fsgid = gid; + cred = RCU_INITIALIZER(kcred); - if (gfp_flags & __GFP_FS) - kcred = prepare_kernel_cred(&init_task); - else { - unsigned int nofs_flags = memalloc_nofs_save(); - kcred = prepare_kernel_cred(&init_task); - memalloc_nofs_restore(nofs_flags); + if (lgr->range.iomode == IOMODE_READ) + rcu_assign_pointer(dss_info->ro_cred, cred); + else + rcu_assign_pointer(dss_info->rw_cred, cred); } - rc = -ENOMEM; - if (!kcred) - goto out_err_free; - kcred->fsuid = uid; - kcred->fsgid = gid; - cred = RCU_INITIALIZER(kcred); - - if (lgr->range.iomode == IOMODE_READ) - rcu_assign_pointer(fls->mirror_array[i]->ro_cred, cred); - else - rcu_assign_pointer(fls->mirror_array[i]->rw_cred, cred); mirror = ff_layout_add_mirror(lh, fls->mirror_array[i]); if (mirror != fls->mirror_array[i]) { - /* swap cred ptrs so free_mirror will clean up old */ - if (lgr->range.iomode == IOMODE_READ) { - cred = xchg(&mirror->ro_cred, cred); - rcu_assign_pointer(fls->mirror_array[i]->ro_cred, cred); - } else { - cred = xchg(&mirror->rw_cred, cred); - rcu_assign_pointer(fls->mirror_array[i]->rw_cred, cred); + for (dss_id = 0; dss_id < dss_count; dss_id++) { + dss_info = &fls->mirror_array[i]->dss[dss_id]; + /* swap cred ptrs so free_mirror will clean up old */ + if (lgr->range.iomode == IOMODE_READ) { + cred = xchg(&mirror->dss[dss_id].ro_cred, + dss_info->ro_cred); + rcu_assign_pointer(dss_info->ro_cred, cred); + } else { + cred = xchg(&mirror->dss[dss_id].rw_cred, + dss_info->rw_cred); + rcu_assign_pointer(dss_info->rw_cred, cred); + } } ff_layout_free_mirror(fls->mirror_array[i]); fls->mirror_array[i] = mirror; @@ -564,7 +639,7 @@ out_sort_mirrors: ret = &fls->generic_hdr; dprintk("<-- %s (success)\n", __func__); out_free_page: - __free_page(scratch); + folio_put(scratch); return ret; out_err_free: _ff_layout_free_lseg(fls); @@ -593,6 +668,26 @@ ff_layout_free_lseg(struct pnfs_layout_segment *lseg) _ff_layout_free_lseg(fls); } +static u32 calc_commit_idx(struct pnfs_layout_segment *lseg, + u32 mirror_idx, u32 dss_id) +{ + struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(lseg); + + return (mirror_idx * flseg->mirror_array[0]->dss_count) + dss_id; +} + +static u32 calc_mirror_idx_from_commit(struct pnfs_layout_segment *lseg, + u32 commit_index) +{ + return commit_index / FF_LAYOUT_LSEG(lseg)->mirror_array[0]->dss_count; +} + +static u32 calc_dss_id_from_commit(struct pnfs_layout_segment *lseg, + u32 commit_index) +{ + return commit_index % FF_LAYOUT_LSEG(lseg)->mirror_array[0]->dss_count; +} + static void nfs4_ff_start_busy_timer(struct nfs4_ff_busy_timer *timer, ktime_t now) { @@ -617,6 +712,7 @@ nfs4_ff_end_busy_timer(struct nfs4_ff_busy_timer *timer, ktime_t now) static bool nfs4_ff_layoutstat_start_io(struct nfs4_ff_layout_mirror *mirror, + u32 dss_id, struct nfs4_ff_layoutstat *layoutstat, ktime_t now) { @@ -624,8 +720,8 @@ nfs4_ff_layoutstat_start_io(struct nfs4_ff_layout_mirror *mirror, struct nfs4_flexfile_layout *ffl = FF_LAYOUT_FROM_HDR(mirror->layout); nfs4_ff_start_busy_timer(&layoutstat->busy_timer, now); - if (!mirror->start_time) - mirror->start_time = now; + if 
(!mirror->dss[dss_id].start_time) + mirror->dss[dss_id].start_time = now; if (mirror->report_interval != 0) report_interval = (s64)mirror->report_interval * 1000LL; else if (layoutstats_timer != 0) @@ -675,13 +771,16 @@ nfs4_ff_layout_stat_io_update_completed(struct nfs4_ff_layoutstat *layoutstat, static void nfs4_ff_layout_stat_io_start_read(struct inode *inode, struct nfs4_ff_layout_mirror *mirror, + u32 dss_id, __u64 requested, ktime_t now) { bool report; spin_lock(&mirror->lock); - report = nfs4_ff_layoutstat_start_io(mirror, &mirror->read_stat, now); - nfs4_ff_layout_stat_io_update_requested(&mirror->read_stat, requested); + report = nfs4_ff_layoutstat_start_io( + mirror, dss_id, &mirror->dss[dss_id].read_stat, now); + nfs4_ff_layout_stat_io_update_requested( + &mirror->dss[dss_id].read_stat, requested); set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags); spin_unlock(&mirror->lock); @@ -692,11 +791,12 @@ nfs4_ff_layout_stat_io_start_read(struct inode *inode, static void nfs4_ff_layout_stat_io_end_read(struct rpc_task *task, struct nfs4_ff_layout_mirror *mirror, + u32 dss_id, __u64 requested, __u64 completed) { spin_lock(&mirror->lock); - nfs4_ff_layout_stat_io_update_completed(&mirror->read_stat, + nfs4_ff_layout_stat_io_update_completed(&mirror->dss[dss_id].read_stat, requested, completed, ktime_get(), task->tk_start); set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags); @@ -706,13 +806,20 @@ nfs4_ff_layout_stat_io_end_read(struct rpc_task *task, static void nfs4_ff_layout_stat_io_start_write(struct inode *inode, struct nfs4_ff_layout_mirror *mirror, + u32 dss_id, __u64 requested, ktime_t now) { bool report; spin_lock(&mirror->lock); - report = nfs4_ff_layoutstat_start_io(mirror , &mirror->write_stat, now); - nfs4_ff_layout_stat_io_update_requested(&mirror->write_stat, requested); + report = nfs4_ff_layoutstat_start_io( + mirror, + dss_id, + &mirror->dss[dss_id].write_stat, + now); + nfs4_ff_layout_stat_io_update_requested( + &mirror->dss[dss_id].write_stat, + requested); set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags); spin_unlock(&mirror->lock); @@ -723,6 +830,7 @@ nfs4_ff_layout_stat_io_start_write(struct inode *inode, static void nfs4_ff_layout_stat_io_end_write(struct rpc_task *task, struct nfs4_ff_layout_mirror *mirror, + u32 dss_id, __u64 requested, __u64 completed, enum nfs3_stable_how committed) @@ -731,25 +839,25 @@ nfs4_ff_layout_stat_io_end_write(struct rpc_task *task, requested = completed = 0; spin_lock(&mirror->lock); - nfs4_ff_layout_stat_io_update_completed(&mirror->write_stat, + nfs4_ff_layout_stat_io_update_completed(&mirror->dss[dss_id].write_stat, requested, completed, ktime_get(), task->tk_start); set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags); spin_unlock(&mirror->lock); } static void -ff_layout_mark_ds_unreachable(struct pnfs_layout_segment *lseg, u32 idx) +ff_layout_mark_ds_unreachable(struct pnfs_layout_segment *lseg, u32 idx, u32 dss_id) { - struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx); + struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx, dss_id); if (devid) nfs4_mark_deviceid_unavailable(devid); } static void -ff_layout_mark_ds_reachable(struct pnfs_layout_segment *lseg, u32 idx) +ff_layout_mark_ds_reachable(struct pnfs_layout_segment *lseg, u32 idx, u32 dss_id) { - struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx); + struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx, dss_id); if (devid) nfs4_mark_deviceid_available(devid); @@ -758,6 +866,7 @@ ff_layout_mark_ds_reachable(struct 
pnfs_layout_segment *lseg, u32 idx) static struct nfs4_pnfs_ds * ff_layout_choose_ds_for_read(struct pnfs_layout_segment *lseg, u32 start_idx, u32 *best_idx, + u32 offset, u32 *dss_id, bool check_device) { struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg); @@ -768,12 +877,16 @@ ff_layout_choose_ds_for_read(struct pnfs_layout_segment *lseg, /* mirrors are initially sorted by efficiency */ for (idx = start_idx; idx < fls->mirror_array_cnt; idx++) { mirror = FF_LAYOUT_COMP(lseg, idx); - ds = nfs4_ff_layout_prepare_ds(lseg, mirror, false); + *dss_id = nfs4_ff_layout_calc_dss_id( + fls->stripe_unit, + fls->mirror_array[idx]->dss_count, + offset); + ds = nfs4_ff_layout_prepare_ds(lseg, mirror, *dss_id, false); if (IS_ERR(ds)) continue; if (check_device && - nfs4_test_deviceid_unavailable(&mirror->mirror_ds->id_node)) { + nfs4_test_deviceid_unavailable(&mirror->dss[*dss_id].mirror_ds->id_node)) { // reinitialize the error state in case this is the last iteration ds = ERR_PTR(-EINVAL); continue; @@ -788,42 +901,52 @@ ff_layout_choose_ds_for_read(struct pnfs_layout_segment *lseg, static struct nfs4_pnfs_ds * ff_layout_choose_any_ds_for_read(struct pnfs_layout_segment *lseg, - u32 start_idx, u32 *best_idx) + u32 start_idx, u32 *best_idx, + u32 offset, u32 *dss_id) { - return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx, false); + return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx, + offset, dss_id, false); } static struct nfs4_pnfs_ds * ff_layout_choose_valid_ds_for_read(struct pnfs_layout_segment *lseg, - u32 start_idx, u32 *best_idx) + u32 start_idx, u32 *best_idx, + u32 offset, u32 *dss_id) { - return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx, true); + return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx, + offset, dss_id, true); } static struct nfs4_pnfs_ds * ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg, - u32 start_idx, u32 *best_idx) + u32 start_idx, u32 *best_idx, + u32 offset, u32 *dss_id) { struct nfs4_pnfs_ds *ds; - ds = ff_layout_choose_valid_ds_for_read(lseg, start_idx, best_idx); + ds = ff_layout_choose_valid_ds_for_read(lseg, start_idx, best_idx, + offset, dss_id); if (!IS_ERR(ds)) return ds; - return ff_layout_choose_any_ds_for_read(lseg, start_idx, best_idx); + return ff_layout_choose_any_ds_for_read(lseg, start_idx, best_idx, + offset, dss_id); } static struct nfs4_pnfs_ds * ff_layout_get_ds_for_read(struct nfs_pageio_descriptor *pgio, - u32 *best_idx) + u32 *best_idx, + u32 offset, + u32 *dss_id) { struct pnfs_layout_segment *lseg = pgio->pg_lseg; struct nfs4_pnfs_ds *ds; ds = ff_layout_choose_best_ds_for_read(lseg, pgio->pg_mirror_idx, - best_idx); + best_idx, offset, dss_id); if (!IS_ERR(ds) || !pgio->pg_mirror_idx) return ds; - return ff_layout_choose_best_ds_for_read(lseg, 0, best_idx); + return ff_layout_choose_best_ds_for_read(lseg, 0, best_idx, + offset, dss_id); } static void @@ -842,6 +965,56 @@ ff_layout_pg_get_read(struct nfs_pageio_descriptor *pgio, } } +static bool +ff_layout_lseg_is_striped(const struct nfs4_ff_layout_segment *fls) +{ + return fls->mirror_array[0]->dss_count > 1; +} + +/* + * ff_layout_pg_test(). Called by nfs_can_coalesce_requests() + * + * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number + * of bytes (maximum @req->wb_bytes) that can be coalesced.
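+ * + * Worked example (illustrative numbers, not from any real layout): with + * stripe_unit = 1 MiB and a segment offset of 0, a request starting at file + * offset 0x0ff000 can contribute at most 0x1000 bytes here, since coalescing + * never crosses a stripe boundary. The stripe that then serves the request + * is (offset / stripe_unit) % dss_count, see nfs4_ff_layout_calc_dss_id().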
+ */ +static size_t +ff_layout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, + struct nfs_page *req) +{ + unsigned int size; + u64 p_stripe, r_stripe; + u32 stripe_offset; + u64 segment_offset = pgio->pg_lseg->pls_range.offset; + u32 stripe_unit = FF_LAYOUT_LSEG(pgio->pg_lseg)->stripe_unit; + + /* calls nfs_generic_pg_test */ + size = pnfs_generic_pg_test(pgio, prev, req); + if (!size) + return 0; + else if (!ff_layout_lseg_is_striped(FF_LAYOUT_LSEG(pgio->pg_lseg))) + return size; + + /* see if req and prev are in the same stripe */ + if (prev) { + p_stripe = (u64)req_offset(prev) - segment_offset; + r_stripe = (u64)req_offset(req) - segment_offset; + do_div(p_stripe, stripe_unit); + do_div(r_stripe, stripe_unit); + + if (p_stripe != r_stripe) + return 0; + } + + /* calculate remaining bytes in the current stripe */ + div_u64_rem((u64)req_offset(req) - segment_offset, + stripe_unit, + &stripe_offset); + WARN_ON_ONCE(stripe_offset > stripe_unit); + if (stripe_offset >= stripe_unit) + return 0; + return min(stripe_unit - (unsigned int)stripe_offset, size); +} + static void ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) @@ -849,7 +1022,7 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_pgio_mirror *pgm; struct nfs4_ff_layout_mirror *mirror; struct nfs4_pnfs_ds *ds; - u32 ds_idx; + u32 ds_idx, dss_id; if (NFS_SERVER(pgio->pg_inode)->flags & (NFS_MOUNT_SOFT|NFS_MOUNT_SOFTERR)) @@ -870,7 +1043,8 @@ retry: /* Reset wb_nio, since getting layout segment was successful */ req->wb_nio = 0; - ds = ff_layout_get_ds_for_read(pgio, &ds_idx); + ds = ff_layout_get_ds_for_read(pgio, &ds_idx, + req_offset(req), &dss_id); if (IS_ERR(ds)) { if (!ff_layout_no_fallback_to_mds(pgio->pg_lseg)) goto out_mds; @@ -882,7 +1056,7 @@ retry: mirror = FF_LAYOUT_COMP(pgio->pg_lseg, ds_idx); pgm = &pgio->pg_mirrors[0]; - pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].rsize; + pgm->pg_bsize = mirror->dss[dss_id].mirror_ds->ds_versions[0].rsize; pgio->pg_mirror_idx = ds_idx; return; @@ -919,7 +1093,7 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs4_ff_layout_mirror *mirror; struct nfs_pgio_mirror *pgm; struct nfs4_pnfs_ds *ds; - u32 i; + u32 i, dss_id; retry: pnfs_generic_pg_check_layout(pgio, req); @@ -944,7 +1118,12 @@ retry: for (i = 0; i < pgio->pg_mirror_count; i++) { mirror = FF_LAYOUT_COMP(pgio->pg_lseg, i); - ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, mirror, true); + dss_id = nfs4_ff_layout_calc_dss_id( + FF_LAYOUT_LSEG(pgio->pg_lseg)->stripe_unit, + mirror->dss_count, + req_offset(req)); + ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, mirror, + dss_id, true); if (IS_ERR(ds)) { if (!ff_layout_no_fallback_to_mds(pgio->pg_lseg)) goto out_mds; @@ -954,7 +1133,7 @@ retry: goto retry; } pgm = &pgio->pg_mirrors[i]; - pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].wsize; + pgm->pg_bsize = mirror->dss[dss_id].mirror_ds->ds_versions[0].wsize; } if (NFS_SERVER(pgio->pg_inode)->flags & @@ -1020,14 +1199,14 @@ ff_layout_pg_get_mirror_write(struct nfs_pageio_descriptor *desc, u32 idx) static const struct nfs_pageio_ops ff_layout_pg_read_ops = { .pg_init = ff_layout_pg_init_read, - .pg_test = pnfs_generic_pg_test, + .pg_test = ff_layout_pg_test, .pg_doio = pnfs_generic_pg_readpages, .pg_cleanup = pnfs_generic_pg_cleanup, }; static const struct nfs_pageio_ops ff_layout_pg_write_ops = { .pg_init = ff_layout_pg_init_write, - .pg_test = pnfs_generic_pg_test, + .pg_test = ff_layout_pg_test, .pg_doio = 
pnfs_generic_pg_writepages, .pg_get_mirror_count = ff_layout_pg_get_mirror_count_write, .pg_cleanup = pnfs_generic_pg_cleanup, @@ -1075,9 +1254,11 @@ static void ff_layout_resend_pnfs_read(struct nfs_pgio_header *hdr) { u32 idx = hdr->pgio_mirror_idx + 1; u32 new_idx = 0; + u32 dss_id = 0; struct nfs4_pnfs_ds *ds; - ds = ff_layout_choose_any_ds_for_read(hdr->lseg, idx, &new_idx); + ds = ff_layout_choose_any_ds_for_read(hdr->lseg, idx, &new_idx, + hdr->args.offset, &dss_id); if (IS_ERR(ds)) pnfs_error_mark_layout_for_return(hdr->inode, hdr->lseg); else @@ -1114,11 +1295,11 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task, struct nfs4_state *state, struct nfs_client *clp, struct pnfs_layout_segment *lseg, - u32 idx) + u32 idx, u32 dss_id) { struct pnfs_layout_hdr *lo = lseg->pls_layout; struct inode *inode = lo->plh_inode; - struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx); + struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx, dss_id); struct nfs4_slot_table *tbl = &clp->cl_session->fc_slot_table; switch (op_status) { @@ -1215,9 +1396,9 @@ static int ff_layout_async_handle_error_v3(struct rpc_task *task, u32 op_status, struct nfs_client *clp, struct pnfs_layout_segment *lseg, - u32 idx) + u32 idx, u32 dss_id) { - struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx); + struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx, dss_id); switch (op_status) { case NFS_OK: @@ -1281,12 +1462,12 @@ static int ff_layout_async_handle_error(struct rpc_task *task, struct nfs4_state *state, struct nfs_client *clp, struct pnfs_layout_segment *lseg, - u32 idx) + u32 idx, u32 dss_id) { int vers = clp->cl_nfs_mod->rpc_vers->number; if (task->tk_status >= 0) { - ff_layout_mark_ds_reachable(lseg, idx); + ff_layout_mark_ds_reachable(lseg, idx, dss_id); return 0; } @@ -1297,10 +1478,10 @@ static int ff_layout_async_handle_error(struct rpc_task *task, switch (vers) { case 3: return ff_layout_async_handle_error_v3(task, op_status, clp, - lseg, idx); + lseg, idx, dss_id); case 4: return ff_layout_async_handle_error_v4(task, op_status, state, - clp, lseg, idx); + clp, lseg, idx, dss_id); default: /* should never happen */ WARN_ON_ONCE(1); @@ -1309,7 +1490,7 @@ static int ff_layout_async_handle_error(struct rpc_task *task, } static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg, - u32 idx, u64 offset, u64 length, + u32 idx, u32 dss_id, u64 offset, u64 length, u32 *op_status, int opnum, int error) { struct nfs4_ff_layout_mirror *mirror; @@ -1347,7 +1528,7 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg, mirror = FF_LAYOUT_COMP(lseg, idx); err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout), - mirror, offset, length, status, opnum, + mirror, dss_id, offset, length, status, opnum, nfs_io_gfp_mask()); switch (status) { @@ -1356,7 +1537,7 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg, case NFS4ERR_PERM: break; case NFS4ERR_NXIO: - ff_layout_mark_ds_unreachable(lseg, idx); + ff_layout_mark_ds_unreachable(lseg, idx, dss_id); /* * Don't return the layout if this is a read and we still * have layouts to try @@ -1376,10 +1557,16 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg, static int ff_layout_read_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr) { + struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(hdr->lseg); + u32 dss_id = nfs4_ff_layout_calc_dss_id( + flseg->stripe_unit, + 
flseg->mirror_array[hdr->pgio_mirror_idx]->dss_count, + hdr->args.offset); int err; if (task->tk_status < 0) { - ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx, + ff_layout_io_track_ds_error(hdr->lseg, + hdr->pgio_mirror_idx, dss_id, hdr->args.offset, hdr->args.count, &hdr->res.op_status, OP_READ, task->tk_status); @@ -1389,7 +1576,8 @@ static int ff_layout_read_done_cb(struct rpc_task *task, err = ff_layout_async_handle_error(task, hdr->res.op_status, hdr->args.context->state, hdr->ds_clp, hdr->lseg, - hdr->pgio_mirror_idx); + hdr->pgio_mirror_idx, + dss_id); trace_nfs4_pnfs_read(hdr, err); clear_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags); @@ -1445,23 +1633,47 @@ ff_layout_set_layoutcommit(struct inode *inode, static void ff_layout_read_record_layoutstats_start(struct rpc_task *task, struct nfs_pgio_header *hdr) { + struct nfs4_ff_layout_mirror *mirror; + u32 dss_id; + if (test_and_set_bit(NFS_IOHDR_STAT, &hdr->flags)) return; - nfs4_ff_layout_stat_io_start_read(hdr->inode, - FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx), - hdr->args.count, - task->tk_start); + + mirror = FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx); + dss_id = nfs4_ff_layout_calc_dss_id( + FF_LAYOUT_LSEG(hdr->lseg)->stripe_unit, + mirror->dss_count, + hdr->args.offset); + + nfs4_ff_layout_stat_io_start_read( + hdr->inode, + mirror, + dss_id, + hdr->args.count, + task->tk_start); } static void ff_layout_read_record_layoutstats_done(struct rpc_task *task, struct nfs_pgio_header *hdr) { + struct nfs4_ff_layout_mirror *mirror; + u32 dss_id; + if (!test_and_clear_bit(NFS_IOHDR_STAT, &hdr->flags)) return; - nfs4_ff_layout_stat_io_end_read(task, - FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx), - hdr->args.count, - hdr->res.count); + + mirror = FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx); + dss_id = nfs4_ff_layout_calc_dss_id( + FF_LAYOUT_LSEG(hdr->lseg)->stripe_unit, + mirror->dss_count, + hdr->args.offset); + + nfs4_ff_layout_stat_io_end_read( + task, + mirror, + dss_id, + hdr->args.count, + hdr->res.count); set_bit(NFS_LSEG_LAYOUTRETURN, &hdr->lseg->pls_flags); } @@ -1549,11 +1761,17 @@ static void ff_layout_read_release(void *data) static int ff_layout_write_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr) { + struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(hdr->lseg); + u32 dss_id = nfs4_ff_layout_calc_dss_id( + flseg->stripe_unit, + flseg->mirror_array[hdr->pgio_mirror_idx]->dss_count, + hdr->args.offset); loff_t end_offs = 0; int err; if (task->tk_status < 0) { - ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx, + ff_layout_io_track_ds_error(hdr->lseg, + hdr->pgio_mirror_idx, dss_id, hdr->args.offset, hdr->args.count, &hdr->res.op_status, OP_WRITE, task->tk_status); @@ -1563,7 +1781,8 @@ static int ff_layout_write_done_cb(struct rpc_task *task, err = ff_layout_async_handle_error(task, hdr->res.op_status, hdr->args.context->state, hdr->ds_clp, hdr->lseg, - hdr->pgio_mirror_idx); + hdr->pgio_mirror_idx, + dss_id); trace_nfs4_pnfs_write(hdr, err); clear_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags); @@ -1601,9 +1820,11 @@ static int ff_layout_commit_done_cb(struct rpc_task *task, struct nfs_commit_data *data) { int err; + u32 idx = calc_mirror_idx_from_commit(data->lseg, data->ds_commit_index); + u32 dss_id = calc_dss_id_from_commit(data->lseg, data->ds_commit_index); if (task->tk_status < 0) { - ff_layout_io_track_ds_error(data->lseg, data->ds_commit_index, + ff_layout_io_track_ds_error(data->lseg, idx, dss_id, data->args.offset, data->args.count, &data->res.op_status, OP_COMMIT, 
task->tk_status); @@ -1611,8 +1832,8 @@ static int ff_layout_commit_done_cb(struct rpc_task *task, } err = ff_layout_async_handle_error(task, data->res.op_status, - NULL, data->ds_clp, data->lseg, - data->ds_commit_index); + NULL, data->ds_clp, data->lseg, idx, + dss_id); trace_nfs4_pnfs_commit_ds(data, err); switch (err) { @@ -1631,30 +1852,54 @@ static int ff_layout_commit_done_cb(struct rpc_task *task, } ff_layout_set_layoutcommit(data->inode, data->lseg, data->lwb); - return 0; } static void ff_layout_write_record_layoutstats_start(struct rpc_task *task, struct nfs_pgio_header *hdr) { + struct nfs4_ff_layout_mirror *mirror; + u32 dss_id; + if (test_and_set_bit(NFS_IOHDR_STAT, &hdr->flags)) return; - nfs4_ff_layout_stat_io_start_write(hdr->inode, - FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx), - hdr->args.count, - task->tk_start); + + mirror = FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx); + dss_id = nfs4_ff_layout_calc_dss_id( + FF_LAYOUT_LSEG(hdr->lseg)->stripe_unit, + mirror->dss_count, + hdr->args.offset); + + nfs4_ff_layout_stat_io_start_write( + hdr->inode, + mirror, + dss_id, + hdr->args.count, + task->tk_start); } static void ff_layout_write_record_layoutstats_done(struct rpc_task *task, struct nfs_pgio_header *hdr) { + struct nfs4_ff_layout_mirror *mirror; + u32 dss_id; + if (!test_and_clear_bit(NFS_IOHDR_STAT, &hdr->flags)) return; - nfs4_ff_layout_stat_io_end_write(task, - FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx), - hdr->args.count, hdr->res.count, - hdr->res.verf->committed); + + mirror = FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx); + dss_id = nfs4_ff_layout_calc_dss_id( + FF_LAYOUT_LSEG(hdr->lseg)->stripe_unit, + mirror->dss_count, + hdr->args.offset); + + nfs4_ff_layout_stat_io_end_write( + task, + mirror, + dss_id, + hdr->args.count, + hdr->res.count, + hdr->res.verf->committed); set_bit(NFS_LSEG_LAYOUTRETURN, &hdr->lseg->pls_flags); } @@ -1737,10 +1982,16 @@ static void ff_layout_write_release(void *data) static void ff_layout_commit_record_layoutstats_start(struct rpc_task *task, struct nfs_commit_data *cdata) { + u32 idx, dss_id; + if (test_and_set_bit(NFS_IOHDR_STAT, &cdata->flags)) return; + + idx = calc_mirror_idx_from_commit(cdata->lseg, cdata->ds_commit_index); + dss_id = calc_dss_id_from_commit(cdata->lseg, cdata->ds_commit_index); nfs4_ff_layout_stat_io_start_write(cdata->inode, - FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index), + FF_LAYOUT_COMP(cdata->lseg, idx), + dss_id, 0, task->tk_start); } @@ -1749,6 +2000,7 @@ static void ff_layout_commit_record_layoutstats_done(struct rpc_task *task, { struct nfs_page *req; __u64 count = 0; + u32 idx, dss_id; if (!test_and_clear_bit(NFS_IOHDR_STAT, &cdata->flags)) return; @@ -1757,8 +2009,12 @@ static void ff_layout_commit_record_layoutstats_done(struct rpc_task *task, list_for_each_entry(req, &cdata->pages, wb_list) count += req->wb_bytes; } + + idx = calc_mirror_idx_from_commit(cdata->lseg, cdata->ds_commit_index); + dss_id = calc_dss_id_from_commit(cdata->lseg, cdata->ds_commit_index); nfs4_ff_layout_stat_io_end_write(task, - FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index), + FF_LAYOUT_COMP(cdata->lseg, idx), + dss_id, count, count, NFS_FILE_SYNC); set_bit(NFS_LSEG_LAYOUTRETURN, &cdata->lseg->pls_flags); } @@ -1872,6 +2128,7 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr) u32 idx = hdr->pgio_mirror_idx; int vers; struct nfs_fh *fh; + u32 dss_id; bool ds_fatal_error = false; dprintk("--> %s ino %lu pgbase %u req %zu@%llu\n", @@ -1879,22 +2136,26 @@ ff_layout_read_pagelist(struct 
nfs_pgio_header *hdr) hdr->args.pgbase, (size_t)hdr->args.count, offset); mirror = FF_LAYOUT_COMP(lseg, idx); - ds = nfs4_ff_layout_prepare_ds(lseg, mirror, false); + dss_id = nfs4_ff_layout_calc_dss_id( + FF_LAYOUT_LSEG(lseg)->stripe_unit, + mirror->dss_count, + offset); + ds = nfs4_ff_layout_prepare_ds(lseg, mirror, dss_id, false); if (IS_ERR(ds)) { ds_fatal_error = nfs_error_is_fatal(PTR_ERR(ds)); goto out_failed; } ds_clnt = nfs4_ff_find_or_create_ds_client(mirror, ds->ds_clp, - hdr->inode); + hdr->inode, dss_id); if (IS_ERR(ds_clnt)) goto out_failed; - ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, hdr->cred); + ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, hdr->cred, dss_id); if (!ds_cred) goto out_failed; - vers = nfs4_ff_layout_ds_version(mirror); + vers = nfs4_ff_layout_ds_version(mirror, dss_id); dprintk("%s USE DS: %s cl_count %d vers %d\n", __func__, ds->ds_remotestr, refcount_read(&ds->ds_clp->cl_count), vers); @@ -1902,11 +2163,11 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr) hdr->pgio_done_cb = ff_layout_read_done_cb; refcount_inc(&ds->ds_clp->cl_count); hdr->ds_clp = ds->ds_clp; - fh = nfs4_ff_layout_select_ds_fh(mirror); + fh = nfs4_ff_layout_select_ds_fh(mirror, dss_id); if (fh) hdr->args.fh = fh; - nfs4_ff_layout_select_ds_stateid(mirror, &hdr->args.stateid); + nfs4_ff_layout_select_ds_stateid(mirror, dss_id, &hdr->args.stateid); /* * Note that if we ever decide to split across DSes, @@ -1916,7 +2177,8 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr) hdr->mds_offset = offset; /* Start IO accounting for local read */ - localio = ff_local_open_fh(lseg, idx, ds->ds_clp, ds_cred, fh, FMODE_READ); + localio = ff_local_open_fh(lseg, idx, dss_id, ds->ds_clp, ds_cred, fh, + FMODE_READ); if (localio) { hdr->task.tk_start = ktime_get(); ff_layout_read_record_layoutstats_start(&hdr->task, hdr); @@ -1953,25 +2215,30 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync) int vers; struct nfs_fh *fh; u32 idx = hdr->pgio_mirror_idx; + u32 dss_id; bool ds_fatal_error = false; mirror = FF_LAYOUT_COMP(lseg, idx); - ds = nfs4_ff_layout_prepare_ds(lseg, mirror, true); + dss_id = nfs4_ff_layout_calc_dss_id( + FF_LAYOUT_LSEG(lseg)->stripe_unit, + mirror->dss_count, + offset); + ds = nfs4_ff_layout_prepare_ds(lseg, mirror, dss_id, true); if (IS_ERR(ds)) { ds_fatal_error = nfs_error_is_fatal(PTR_ERR(ds)); goto out_failed; } ds_clnt = nfs4_ff_find_or_create_ds_client(mirror, ds->ds_clp, - hdr->inode); + hdr->inode, dss_id); if (IS_ERR(ds_clnt)) goto out_failed; - ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, hdr->cred); + ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, hdr->cred, dss_id); if (!ds_cred) goto out_failed; - vers = nfs4_ff_layout_ds_version(mirror); + vers = nfs4_ff_layout_ds_version(mirror, dss_id); dprintk("%s ino %lu sync %d req %zu@%llu DS: %s cl_count %d vers %d\n", __func__, hdr->inode->i_ino, sync, (size_t) hdr->args.count, @@ -1981,12 +2248,12 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync) hdr->pgio_done_cb = ff_layout_write_done_cb; refcount_inc(&ds->ds_clp->cl_count); hdr->ds_clp = ds->ds_clp; - hdr->ds_commit_idx = idx; - fh = nfs4_ff_layout_select_ds_fh(mirror); + hdr->ds_commit_idx = calc_commit_idx(lseg, idx, dss_id); + fh = nfs4_ff_layout_select_ds_fh(mirror, dss_id); if (fh) hdr->args.fh = fh; - nfs4_ff_layout_select_ds_stateid(mirror, &hdr->args.stateid); + nfs4_ff_layout_select_ds_stateid(mirror, dss_id, &hdr->args.stateid); /* * Note that if we ever decide to split across 
DSes, @@ -1995,7 +2262,7 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync) hdr->args.offset = offset; /* Start IO accounting for local write */ - localio = ff_local_open_fh(lseg, idx, ds->ds_clp, ds_cred, fh, + localio = ff_local_open_fh(lseg, idx, dss_id, ds->ds_clp, ds_cred, fh, FMODE_READ|FMODE_WRITE); if (localio) { hdr->task.tk_start = ktime_get(); @@ -2019,20 +2286,15 @@ out_failed: return PNFS_NOT_ATTEMPTED; } -static u32 calc_ds_index_from_commit(struct pnfs_layout_segment *lseg, u32 i) -{ - return i; -} - static struct nfs_fh * -select_ds_fh_from_commit(struct pnfs_layout_segment *lseg, u32 i) +select_ds_fh_from_commit(struct pnfs_layout_segment *lseg, u32 i, u32 dss_id) { struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(lseg); /* FIXME: Assume that there is only one NFS version available * for the DS. */ - return &flseg->mirror_array[i]->fh_versions[0]; + return &flseg->mirror_array[i]->dss[dss_id].fh_versions[0]; } static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how) @@ -2043,7 +2305,7 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how) struct nfsd_file *localio; struct nfs4_ff_layout_mirror *mirror; const struct cred *ds_cred; - u32 idx; + u32 idx, dss_id; int vers, ret; struct nfs_fh *fh; @@ -2051,22 +2313,23 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how) test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))) goto out_err; - idx = calc_ds_index_from_commit(lseg, data->ds_commit_index); + idx = calc_mirror_idx_from_commit(lseg, data->ds_commit_index); mirror = FF_LAYOUT_COMP(lseg, idx); - ds = nfs4_ff_layout_prepare_ds(lseg, mirror, true); + dss_id = calc_dss_id_from_commit(lseg, data->ds_commit_index); + ds = nfs4_ff_layout_prepare_ds(lseg, mirror, dss_id, true); if (IS_ERR(ds)) goto out_err; ds_clnt = nfs4_ff_find_or_create_ds_client(mirror, ds->ds_clp, - data->inode); + data->inode, dss_id); if (IS_ERR(ds_clnt)) goto out_err; - ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, data->cred); + ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, data->cred, dss_id); if (!ds_cred) goto out_err; - vers = nfs4_ff_layout_ds_version(mirror); + vers = nfs4_ff_layout_ds_version(mirror, dss_id); dprintk("%s ino %lu, how %d cl_count %d vers %d\n", __func__, data->inode->i_ino, how, refcount_read(&ds->ds_clp->cl_count), @@ -2075,12 +2338,12 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how) data->cred = ds_cred; refcount_inc(&ds->ds_clp->cl_count); data->ds_clp = ds->ds_clp; - fh = select_ds_fh_from_commit(lseg, data->ds_commit_index); + fh = select_ds_fh_from_commit(lseg, idx, dss_id); if (fh) data->args.fh = fh; /* Start IO accounting for local commit */ - localio = ff_local_open_fh(lseg, idx, ds->ds_clp, ds_cred, fh, + localio = ff_local_open_fh(lseg, idx, dss_id, ds->ds_clp, ds_cred, fh, FMODE_READ|FMODE_WRITE); if (localio) { data->task.tk_start = ktime_get(); @@ -2144,25 +2407,28 @@ static void ff_layout_cancel_io(struct pnfs_layout_segment *lseg) struct nfs4_pnfs_ds *ds; struct nfs_client *ds_clp; struct rpc_clnt *clnt; - u32 idx; + u32 idx, dss_id; for (idx = 0; idx < flseg->mirror_array_cnt; idx++) { mirror = flseg->mirror_array[idx]; - mirror_ds = mirror->mirror_ds; - if (IS_ERR_OR_NULL(mirror_ds)) - continue; - ds = mirror->mirror_ds->ds; - if (!ds) - continue; - ds_clp = ds->ds_clp; - if (!ds_clp) - continue; - clnt = ds_clp->cl_rpcclient; - if (!clnt) - continue; - if (!rpc_cancel_tasks(clnt, -EAGAIN, ff_layout_match_io, lseg)) - 
continue; - rpc_clnt_disconnect(clnt); + for (dss_id = 0; dss_id < mirror->dss_count; dss_id++) { + mirror_ds = mirror->dss[dss_id].mirror_ds; + if (IS_ERR_OR_NULL(mirror_ds)) + continue; + ds = mirror->dss[dss_id].mirror_ds->ds; + if (!ds) + continue; + ds_clp = ds->ds_clp; + if (!ds_clp) + continue; + clnt = ds_clp->cl_rpcclient; + if (!clnt) + continue; + if (!rpc_cancel_tasks(clnt, -EAGAIN, + ff_layout_match_io, lseg)) + continue; + rpc_clnt_disconnect(clnt); + } } } @@ -2184,8 +2450,9 @@ ff_layout_setup_ds_info(struct pnfs_ds_commit_info *fl_cinfo, struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(lseg); struct inode *inode = lseg->pls_layout->plh_inode; struct pnfs_commit_array *array, *new; + u32 size = flseg->mirror_array_cnt * flseg->mirror_array[0]->dss_count; - new = pnfs_alloc_commit_array(flseg->mirror_array_cnt, + new = pnfs_alloc_commit_array(size, nfs_io_gfp_mask()); if (new) { spin_lock(&inode->i_lock); @@ -2549,11 +2816,11 @@ ff_layout_encode_io_latency(struct xdr_stream *xdr, static void ff_layout_encode_ff_layoutupdate(struct xdr_stream *xdr, const struct nfs42_layoutstat_devinfo *devinfo, - struct nfs4_ff_layout_mirror *mirror) + struct nfs4_ff_layout_ds_stripe *dss_info) { struct nfs4_pnfs_ds_addr *da; - struct nfs4_pnfs_ds *ds = mirror->mirror_ds->ds; - struct nfs_fh *fh = &mirror->fh_versions[0]; + struct nfs4_pnfs_ds *ds = dss_info->mirror_ds->ds; + struct nfs_fh *fh = &dss_info->fh_versions[0]; __be32 *p; da = list_first_entry(&ds->ds_addrs, struct nfs4_pnfs_ds_addr, da_node); @@ -2565,13 +2832,17 @@ ff_layout_encode_ff_layoutupdate(struct xdr_stream *xdr, p = xdr_reserve_space(xdr, 4 + fh->size); xdr_encode_opaque(p, fh->data, fh->size); /* ff_io_latency4 read */ - spin_lock(&mirror->lock); - ff_layout_encode_io_latency(xdr, &mirror->read_stat.io_stat); + spin_lock(&dss_info->mirror->lock); + ff_layout_encode_io_latency(xdr, + &dss_info->read_stat.io_stat); /* ff_io_latency4 write */ - ff_layout_encode_io_latency(xdr, &mirror->write_stat.io_stat); - spin_unlock(&mirror->lock); + ff_layout_encode_io_latency(xdr, + &dss_info->write_stat.io_stat); + spin_unlock(&dss_info->mirror->lock); /* nfstime4 */ - ff_layout_encode_nfstime(xdr, ktime_sub(ktime_get(), mirror->start_time)); + ff_layout_encode_nfstime(xdr, + ktime_sub(ktime_get(), + dss_info->start_time)); /* bool */ p = xdr_reserve_space(xdr, 4); *p = cpu_to_be32(false); @@ -2595,7 +2866,8 @@ ff_layout_encode_layoutstats(struct xdr_stream *xdr, const void *args, static void ff_layout_free_layoutstats(struct nfs4_xdr_opaque_data *opaque) { - struct nfs4_ff_layout_mirror *mirror = opaque->data; + struct nfs4_ff_layout_ds_stripe *dss_info = opaque->data; + struct nfs4_ff_layout_mirror *mirror = dss_info->mirror; ff_layout_put_mirror(mirror); } @@ -2612,37 +2884,47 @@ ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo, { struct nfs4_flexfile_layout *ff_layout = FF_LAYOUT_FROM_HDR(lo); struct nfs4_ff_layout_mirror *mirror; + struct nfs4_ff_layout_ds_stripe *dss_info; struct nfs4_deviceid_node *dev; - int i = 0; + int i = 0, dss_id; list_for_each_entry(mirror, &ff_layout->mirrors, mirrors) { - if (i >= dev_limit) - break; - if (IS_ERR_OR_NULL(mirror->mirror_ds)) - continue; - if (!test_and_clear_bit(NFS4_FF_MIRROR_STAT_AVAIL, - &mirror->flags) && - type != NFS4_FF_OP_LAYOUTRETURN) - continue; - /* mirror refcount put in cleanup_layoutstats */ - if (!refcount_inc_not_zero(&mirror->ref)) - continue; - dev = &mirror->mirror_ds->id_node; - memcpy(&devinfo->dev_id, &dev->deviceid, NFS4_DEVICEID4_SIZE); - 
devinfo->offset = 0; - devinfo->length = NFS4_MAX_UINT64; - spin_lock(&mirror->lock); - devinfo->read_count = mirror->read_stat.io_stat.ops_completed; - devinfo->read_bytes = mirror->read_stat.io_stat.bytes_completed; - devinfo->write_count = mirror->write_stat.io_stat.ops_completed; - devinfo->write_bytes = mirror->write_stat.io_stat.bytes_completed; - spin_unlock(&mirror->lock); - devinfo->layout_type = LAYOUT_FLEX_FILES; - devinfo->ld_private.ops = &layoutstat_ops; - devinfo->ld_private.data = mirror; - - devinfo++; - i++; + for (dss_id = 0; dss_id < mirror->dss_count; ++dss_id) { + dss_info = &mirror->dss[dss_id]; + if (i >= dev_limit) + break; + if (IS_ERR_OR_NULL(dss_info->mirror_ds)) + continue; + if (!test_and_clear_bit(NFS4_FF_MIRROR_STAT_AVAIL, + &mirror->flags) && + type != NFS4_FF_OP_LAYOUTRETURN) + continue; + /* mirror refcount put in cleanup_layoutstats */ + if (!refcount_inc_not_zero(&mirror->ref)) + continue; + dev = &dss_info->mirror_ds->id_node; + memcpy(&devinfo->dev_id, + &dev->deviceid, + NFS4_DEVICEID4_SIZE); + devinfo->offset = 0; + devinfo->length = NFS4_MAX_UINT64; + spin_lock(&mirror->lock); + devinfo->read_count = + dss_info->read_stat.io_stat.ops_completed; + devinfo->read_bytes = + dss_info->read_stat.io_stat.bytes_completed; + devinfo->write_count = + dss_info->write_stat.io_stat.ops_completed; + devinfo->write_bytes = + dss_info->write_stat.io_stat.bytes_completed; + spin_unlock(&mirror->lock); + devinfo->layout_type = LAYOUT_FLEX_FILES; + devinfo->ld_private.ops = &layoutstat_ops; + devinfo->ld_private.data = &mirror->dss[dss_id]; + + devinfo++; + i++; + } } return i; } diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h index 095df09017a5..17a008c8e97c 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.h +++ b/fs/nfs/flexfilelayout/flexfilelayout.h @@ -21,6 +21,8 @@ * due to network error etc. 
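+ * + * NFS4_FLEXFILE_LAYOUT_MAX_STRIPE_CNT below applies the same sanity bound + * per mirror to the DS stripe count; ff_layout_alloc_lseg() rejects layouts + * whose dss_count is 0 or exceeds it.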
*/ #define NFS4_FLEXFILE_LAYOUT_MAX_MIRROR_CNT 4096 +#define NFS4_FLEXFILE_LAYOUT_MAX_STRIPE_CNT 4096 + /* LAYOUTSTATS report interval in ms */ #define FF_LAYOUTSTATS_REPORT_INTERVAL (60000L) #define FF_LAYOUTSTATS_MAXDEV 4 @@ -71,12 +73,12 @@ struct nfs4_ff_layoutstat { struct nfs4_ff_busy_timer busy_timer; }; -struct nfs4_ff_layout_mirror { - struct pnfs_layout_hdr *layout; - struct list_head mirrors; - u32 ds_count; - u32 efficiency; +struct nfs4_ff_layout_mirror; + +struct nfs4_ff_layout_ds_stripe { + struct nfs4_ff_layout_mirror *mirror; struct nfs4_deviceid devid; + u32 efficiency; struct nfs4_ff_layout_ds *mirror_ds; u32 fh_versions_cnt; struct nfs_fh *fh_versions; @@ -84,12 +86,19 @@ struct nfs4_ff_layout_mirror { const struct cred __rcu *ro_cred; const struct cred __rcu *rw_cred; struct nfs_file_localio nfl; - refcount_t ref; - spinlock_t lock; - unsigned long flags; struct nfs4_ff_layoutstat read_stat; struct nfs4_ff_layoutstat write_stat; ktime_t start_time; +}; + +struct nfs4_ff_layout_mirror { + struct pnfs_layout_hdr *layout; + struct list_head mirrors; + u32 dss_count; + struct nfs4_ff_layout_ds_stripe *dss; + refcount_t ref; + spinlock_t lock; + unsigned long flags; u32 report_interval; }; @@ -150,12 +159,12 @@ FF_LAYOUT_COMP(struct pnfs_layout_segment *lseg, u32 idx) } static inline struct nfs4_deviceid_node * -FF_LAYOUT_DEVID_NODE(struct pnfs_layout_segment *lseg, u32 idx) +FF_LAYOUT_DEVID_NODE(struct pnfs_layout_segment *lseg, u32 idx, u32 dss_id) { struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, idx); if (mirror != NULL) { - struct nfs4_ff_layout_ds *mirror_ds = mirror->mirror_ds; + struct nfs4_ff_layout_ds *mirror_ds = mirror->dss[dss_id].mirror_ds; if (!IS_ERR_OR_NULL(mirror_ds)) return &mirror_ds->id_node; @@ -182,9 +191,22 @@ ff_layout_no_read_on_rw(struct pnfs_layout_segment *lseg) } static inline int -nfs4_ff_layout_ds_version(const struct nfs4_ff_layout_mirror *mirror) +nfs4_ff_layout_ds_version(const struct nfs4_ff_layout_mirror *mirror, u32 dss_id) +{ + return mirror->dss[dss_id].mirror_ds->ds_versions[0].version; +} + +static inline u32 +nfs4_ff_layout_calc_dss_id(const u64 stripe_unit, const u32 dss_count, const loff_t offset) { - return mirror->mirror_ds->ds_versions[0].version; + u64 tmp = offset; + + if (dss_count == 1 || stripe_unit == 0) + return 0; + + do_div(tmp, stripe_unit); + + return do_div(tmp, dss_count); } struct nfs4_ff_layout_ds * @@ -193,9 +215,9 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, void nfs4_ff_layout_put_deviceid(struct nfs4_ff_layout_ds *mirror_ds); void nfs4_ff_layout_free_deviceid(struct nfs4_ff_layout_ds *mirror_ds); int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo, - struct nfs4_ff_layout_mirror *mirror, u64 offset, - u64 length, int status, enum nfs_opnum4 opnum, - gfp_t gfp_flags); + struct nfs4_ff_layout_mirror *mirror, + u32 dss_id, u64 offset, u64 length, int status, + enum nfs_opnum4 opnum, gfp_t gfp_flags); void ff_layout_send_layouterror(struct pnfs_layout_segment *lseg); int ff_layout_encode_ds_ioerr(struct xdr_stream *xdr, const struct list_head *head); void ff_layout_free_ds_ioerr(struct list_head *head); @@ -204,23 +226,27 @@ unsigned int ff_layout_fetch_ds_ioerr(struct pnfs_layout_hdr *lo, struct list_head *head, unsigned int maxnum); struct nfs_fh * -nfs4_ff_layout_select_ds_fh(struct nfs4_ff_layout_mirror *mirror); +nfs4_ff_layout_select_ds_fh(struct nfs4_ff_layout_mirror *mirror, u32 dss_id); void nfs4_ff_layout_select_ds_stateid(const struct 
nfs4_ff_layout_mirror *mirror, - nfs4_stateid *stateid); + u32 dss_id, + nfs4_stateid *stateid); struct nfs4_pnfs_ds * nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, struct nfs4_ff_layout_mirror *mirror, + u32 dss_id, bool fail_return); struct rpc_clnt * nfs4_ff_find_or_create_ds_client(struct nfs4_ff_layout_mirror *mirror, struct nfs_client *ds_clp, - struct inode *inode); + struct inode *inode, + u32 dss_id); const struct cred *ff_layout_get_ds_cred(struct nfs4_ff_layout_mirror *mirror, const struct pnfs_layout_range *range, - const struct cred *mdscred); + const struct cred *mdscred, + u32 dss_id); bool ff_layout_avoid_mds_available_ds(struct pnfs_layout_segment *lseg); bool ff_layout_avoid_read_on_rw(struct pnfs_layout_segment *lseg); diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c index 30365ec782bb..c55ea8fa3bfa 100644 --- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -44,7 +44,7 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, { struct xdr_stream stream; struct xdr_buf buf; - struct page *scratch; + struct folio *scratch; struct list_head dsaddrs; struct nfs4_pnfs_ds_addr *da; struct nfs4_ff_layout_ds *new_ds = NULL; @@ -56,7 +56,7 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, int i, ret = -ENOMEM; /* set up xdr stream */ - scratch = alloc_page(gfp_flags); + scratch = folio_alloc(gfp_flags, 0); if (!scratch) goto out_err; @@ -70,7 +70,7 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, INIT_LIST_HEAD(&dsaddrs); xdr_init_decode_pages(&stream, &buf, pdev->pages, pdev->pglen); - xdr_set_scratch_page(&stream, scratch); + xdr_set_scratch_folio(&stream, scratch); /* multipath count */ p = xdr_inline_decode(&stream, 4); @@ -163,7 +163,7 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, kfree(da); } - __free_page(scratch); + folio_put(scratch); return new_ds; out_err_drain_dsaddrs: @@ -177,7 +177,7 @@ out_err_drain_dsaddrs: kfree(ds_versions); out_scratch: - __free_page(scratch); + folio_put(scratch); out_err: kfree(new_ds); @@ -250,16 +250,16 @@ ff_layout_add_ds_error_locked(struct nfs4_flexfile_layout *flo, } int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo, - struct nfs4_ff_layout_mirror *mirror, u64 offset, - u64 length, int status, enum nfs_opnum4 opnum, - gfp_t gfp_flags) + struct nfs4_ff_layout_mirror *mirror, + u32 dss_id, u64 offset, u64 length, int status, + enum nfs_opnum4 opnum, gfp_t gfp_flags) { struct nfs4_ff_layout_ds_err *dserr; if (status == 0) return 0; - if (IS_ERR_OR_NULL(mirror->mirror_ds)) + if (IS_ERR_OR_NULL(mirror->dss[dss_id].mirror_ds)) return -EINVAL; dserr = kmalloc(sizeof(*dserr), gfp_flags); @@ -271,8 +271,8 @@ int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo, dserr->length = length; dserr->status = status; dserr->opnum = opnum; - nfs4_stateid_copy(&dserr->stateid, &mirror->stateid); - memcpy(&dserr->deviceid, &mirror->mirror_ds->id_node.deviceid, + nfs4_stateid_copy(&dserr->stateid, &mirror->dss[dss_id].stateid); + memcpy(&dserr->deviceid, &mirror->dss[dss_id].mirror_ds->id_node.deviceid, NFS4_DEVICEID4_SIZE); spin_lock(&flo->generic_hdr.plh_inode->i_lock); @@ -282,14 +282,14 @@ int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo, } static const struct cred * -ff_layout_get_mirror_cred(struct nfs4_ff_layout_mirror *mirror, u32 iomode) +ff_layout_get_mirror_cred(struct 
nfs4_ff_layout_mirror *mirror, u32 iomode, u32 dss_id) { const struct cred *cred, __rcu **pcred; if (iomode == IOMODE_READ) - pcred = &mirror->ro_cred; + pcred = &mirror->dss[dss_id].ro_cred; else - pcred = &mirror->rw_cred; + pcred = &mirror->dss[dss_id].rw_cred; rcu_read_lock(); do { @@ -304,43 +304,45 @@ ff_layout_get_mirror_cred(struct nfs4_ff_layout_mirror *mirror, u32 iomode) } struct nfs_fh * -nfs4_ff_layout_select_ds_fh(struct nfs4_ff_layout_mirror *mirror) +nfs4_ff_layout_select_ds_fh(struct nfs4_ff_layout_mirror *mirror, u32 dss_id) { /* FIXME: For now assume there is only 1 version available for the DS */ - return &mirror->fh_versions[0]; + return &mirror->dss[dss_id].fh_versions[0]; } void nfs4_ff_layout_select_ds_stateid(const struct nfs4_ff_layout_mirror *mirror, - nfs4_stateid *stateid) + u32 dss_id, + nfs4_stateid *stateid) { - if (nfs4_ff_layout_ds_version(mirror) == 4) - nfs4_stateid_copy(stateid, &mirror->stateid); + if (nfs4_ff_layout_ds_version(mirror, dss_id) == 4) + nfs4_stateid_copy(stateid, &mirror->dss[dss_id].stateid); } static bool ff_layout_init_mirror_ds(struct pnfs_layout_hdr *lo, - struct nfs4_ff_layout_mirror *mirror) + struct nfs4_ff_layout_mirror *mirror, + u32 dss_id) { if (mirror == NULL) goto outerr; - if (mirror->mirror_ds == NULL) { + if (mirror->dss[dss_id].mirror_ds == NULL) { struct nfs4_deviceid_node *node; struct nfs4_ff_layout_ds *mirror_ds = ERR_PTR(-ENODEV); node = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode), - &mirror->devid, lo->plh_lc_cred, + &mirror->dss[dss_id].devid, lo->plh_lc_cred, GFP_KERNEL); if (node) mirror_ds = FF_LAYOUT_MIRROR_DS(node); /* check for race with another call to this function */ - if (cmpxchg(&mirror->mirror_ds, NULL, mirror_ds) && + if (cmpxchg(&mirror->dss[dss_id].mirror_ds, NULL, mirror_ds) && mirror_ds != ERR_PTR(-ENODEV)) nfs4_put_deviceid_node(node); } - if (IS_ERR(mirror->mirror_ds)) + if (IS_ERR(mirror->dss[dss_id].mirror_ds)) goto outerr; return true; @@ -352,6 +354,7 @@ outerr: * nfs4_ff_layout_prepare_ds - prepare a DS connection for an RPC call * @lseg: the layout segment we're operating on * @mirror: layout mirror describing the DS to use + * @dss_id: DS stripe id to select stripe to use * @fail_return: return layout on connect failure? * * Try to prepare a DS connection to accept an RPC call. This involves @@ -368,6 +371,7 @@ outerr: struct nfs4_pnfs_ds * nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, struct nfs4_ff_layout_mirror *mirror, + u32 dss_id, bool fail_return) { struct nfs4_pnfs_ds *ds; @@ -376,10 +380,10 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, unsigned int max_payload; int status = -EAGAIN; - if (!ff_layout_init_mirror_ds(lseg->pls_layout, mirror)) + if (!ff_layout_init_mirror_ds(lseg->pls_layout, mirror, dss_id)) goto noconnect; - ds = mirror->mirror_ds->ds; + ds = mirror->dss[dss_id].mirror_ds->ds; if (READ_ONCE(ds->ds_clp)) goto out; /* matching smp_wmb() in _nfs4_pnfs_v3/4_ds_connect */ @@ -388,10 +392,10 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, /* FIXME: For now we assume the server sent only one version of NFS * to use for the DS. 
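 * (Hence ds_versions[0] supplies the version, minor version, rsize and wsize used below.)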
*/ - status = nfs4_pnfs_ds_connect(s, ds, &mirror->mirror_ds->id_node, + status = nfs4_pnfs_ds_connect(s, ds, &mirror->dss[dss_id].mirror_ds->id_node, dataserver_timeo, dataserver_retrans, - mirror->mirror_ds->ds_versions[0].version, - mirror->mirror_ds->ds_versions[0].minor_version); + mirror->dss[dss_id].mirror_ds->ds_versions[0].version, + mirror->dss[dss_id].mirror_ds->ds_versions[0].minor_version); /* connect success, check rsize/wsize limit */ if (!status) { @@ -404,15 +408,15 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, max_payload = nfs_block_size(rpc_max_payload(ds->ds_clp->cl_rpcclient), NULL); - if (mirror->mirror_ds->ds_versions[0].rsize > max_payload) - mirror->mirror_ds->ds_versions[0].rsize = max_payload; - if (mirror->mirror_ds->ds_versions[0].wsize > max_payload) - mirror->mirror_ds->ds_versions[0].wsize = max_payload; + if (mirror->dss[dss_id].mirror_ds->ds_versions[0].rsize > max_payload) + mirror->dss[dss_id].mirror_ds->ds_versions[0].rsize = max_payload; + if (mirror->dss[dss_id].mirror_ds->ds_versions[0].wsize > max_payload) + mirror->dss[dss_id].mirror_ds->ds_versions[0].wsize = max_payload; goto out; } noconnect: ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout), - mirror, lseg->pls_range.offset, + mirror, dss_id, lseg->pls_range.offset, lseg->pls_range.length, NFS4ERR_NXIO, OP_ILLEGAL, GFP_NOIO); ff_layout_send_layouterror(lseg); @@ -426,12 +430,13 @@ out: const struct cred * ff_layout_get_ds_cred(struct nfs4_ff_layout_mirror *mirror, const struct pnfs_layout_range *range, - const struct cred *mdscred) + const struct cred *mdscred, + u32 dss_id) { const struct cred *cred; - if (mirror && !mirror->mirror_ds->ds_versions[0].tightly_coupled) { - cred = ff_layout_get_mirror_cred(mirror, range->iomode); + if (mirror && !mirror->dss[dss_id].mirror_ds->ds_versions[0].tightly_coupled) { + cred = ff_layout_get_mirror_cred(mirror, range->iomode, dss_id); if (!cred) cred = get_cred(mdscred); } else { @@ -445,15 +450,17 @@ ff_layout_get_ds_cred(struct nfs4_ff_layout_mirror *mirror, * @mirror: pointer to the mirror * @ds_clp: nfs_client for the DS * @inode: pointer to inode + * @dss_id: DS stripe id * * Find or create a DS rpc client with the MDS server rpc client auth flavor * in the nfs_client cl_ds_clients list.
*/ struct rpc_clnt * nfs4_ff_find_or_create_ds_client(struct nfs4_ff_layout_mirror *mirror, - struct nfs_client *ds_clp, struct inode *inode) + struct nfs_client *ds_clp, struct inode *inode, + u32 dss_id) { - switch (mirror->mirror_ds->ds_versions[0].version) { + switch (mirror->dss[dss_id].mirror_ds->ds_versions[0].version) { case 3: /* For NFSv3 DS, flavor is set when creating DS connections */ return ds_clp->cl_rpcclient; @@ -559,16 +566,18 @@ static bool ff_read_layout_has_available_ds(struct pnfs_layout_segment *lseg) { struct nfs4_ff_layout_mirror *mirror; struct nfs4_deviceid_node *devid; - u32 idx; + u32 idx, dss_id; for (idx = 0; idx < FF_LAYOUT_MIRROR_COUNT(lseg); idx++) { mirror = FF_LAYOUT_COMP(lseg, idx); - if (mirror) { - if (!mirror->mirror_ds) + if (!mirror) + continue; + for (dss_id = 0; dss_id < mirror->dss_count; dss_id++) { + if (!mirror->dss[dss_id].mirror_ds) return true; - if (IS_ERR(mirror->mirror_ds)) + if (IS_ERR(mirror->dss[dss_id].mirror_ds)) continue; - devid = &mirror->mirror_ds->id_node; + devid = &mirror->dss[dss_id].mirror_ds->id_node; if (!nfs4_test_deviceid_unavailable(devid)) return true; } @@ -581,17 +590,21 @@ static bool ff_rw_layout_has_available_ds(struct pnfs_layout_segment *lseg) { struct nfs4_ff_layout_mirror *mirror; struct nfs4_deviceid_node *devid; - u32 idx; + u32 idx, dss_id; for (idx = 0; idx < FF_LAYOUT_MIRROR_COUNT(lseg); idx++) { mirror = FF_LAYOUT_COMP(lseg, idx); - if (!mirror || IS_ERR(mirror->mirror_ds)) - return false; - if (!mirror->mirror_ds) - continue; - devid = &mirror->mirror_ds->id_node; - if (nfs4_test_deviceid_unavailable(devid)) + if (!mirror) return false; + for (dss_id = 0; dss_id < mirror->dss_count; dss_id++) { + if (IS_ERR(mirror->dss[dss_id].mirror_ds)) + return false; + if (!mirror->dss[dss_id].mirror_ds) + continue; + devid = &mirror->dss[dss_id].mirror_ds->id_node; + if (nfs4_test_deviceid_unavailable(devid)) + return false; + } } return FF_LAYOUT_MIRROR_COUNT(lseg) != 0; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 9bdaf7f38bed..18b57c7c2f97 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1073,6 +1073,21 @@ out_no_revalidate: if (S_ISDIR(inode->i_mode)) stat->blksize = NFS_SERVER(inode)->dtsize; stat->btime = NFS_I(inode)->btime; + + /* Special handling for STATX_DIOALIGN and STATX_DIO_READ_ALIGN + * - NFS doesn't have DIO alignment constraints, avoid getting + * these DIO attrs from remote and just respond with most + * accommodating limits (so client will issue supported DIO). + * - this is unintuitive, but the most coarse-grained + * dio_offset_align is the most accommodating. 
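+ * - for example, a client that sees dio_offset_align == PAGE_SIZE will only + * issue page-aligned O_DIRECT requests, which any plausible server-side + * alignment (512 bytes, 4 KiB, ...) can satisfy; a finer advertised + * alignment could invite DIO the export cannot honor.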
+ */ + if ((request_mask & (STATX_DIOALIGN | STATX_DIO_READ_ALIGN)) && + S_ISREG(inode->i_mode)) { + stat->result_mask |= STATX_DIOALIGN | STATX_DIO_READ_ALIGN; + stat->dio_mem_align = 4; /* 4-byte alignment */ + stat->dio_offset_align = PAGE_SIZE; + stat->dio_read_offset_align = stat->dio_offset_align; + } out: trace_nfs_getattr_exit(inode, err); return err; diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index c0a44f389f8f..2ecd38e1d17a 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -456,6 +456,16 @@ extern int nfs_wait_bit_killable(struct wait_bit_key *key, int mode); #if IS_ENABLED(CONFIG_NFS_LOCALIO) /* localio.c */ +struct nfs_local_dio { + u32 mem_align; + u32 offset_align; + loff_t middle_offset; + loff_t end_offset; + ssize_t start_len; /* Length for misaligned first extent */ + ssize_t middle_len; /* Length for DIO-aligned middle extent */ + ssize_t end_len; /* Length for misaligned last extent */ +}; + extern void nfs_local_probe_async(struct nfs_client *); extern void nfs_local_probe_async_work(struct work_struct *); extern struct nfsd_file *nfs_local_open_fh(struct nfs_client *, diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c index 97abf62f109d..2c0455e91571 100644 --- a/fs/nfs/localio.c +++ b/fs/nfs/localio.c @@ -30,6 +30,8 @@ #define NFSDBG_FACILITY NFSDBG_VFS +#define NFSLOCAL_MAX_IOS 3 + struct nfs_local_kiocb { struct kiocb kiocb; struct bio_vec *bvec; @@ -37,6 +39,14 @@ struct nfs_local_kiocb { struct work_struct work; void (*aio_complete_work)(struct work_struct *); struct nfsd_file *localio; + /* Begin mostly DIO-specific members */ + size_t end_len; + short int end_iter_index; + short int n_iters; + bool iter_is_dio_aligned[NFSLOCAL_MAX_IOS]; + loff_t offset[NFSLOCAL_MAX_IOS] ____cacheline_aligned; + struct iov_iter iters[NFSLOCAL_MAX_IOS]; + /* End mostly DIO-specific members */ }; struct nfs_local_fsync_ctx { @@ -49,11 +59,6 @@ struct nfs_local_fsync_ctx { static bool localio_enabled __read_mostly = true; module_param(localio_enabled, bool, 0644); -static bool localio_O_DIRECT_semantics __read_mostly = false; -module_param(localio_O_DIRECT_semantics, bool, 0644); -MODULE_PARM_DESC(localio_O_DIRECT_semantics, - "LOCALIO will use O_DIRECT semantics to filesystem."); - static inline bool nfs_client_is_local(const struct nfs_client *clp) { return !!rcu_access_pointer(clp->cl_uuid.net); @@ -231,13 +236,13 @@ __nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred, struct nfsd_file __rcu **pnf, const fmode_t mode) { + int status = 0; struct nfsd_file *localio; localio = nfs_open_local_fh(&clp->cl_uuid, clp->cl_rpcclient, cred, fh, nfl, pnf, mode); if (IS_ERR(localio)) { - int status = PTR_ERR(localio); - trace_nfs_local_open_fh(fh, mode, status); + status = PTR_ERR(localio); switch (status) { case -ENOMEM: case -ENXIO: @@ -247,6 +252,7 @@ __nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred, nfs_local_probe(clp); } } + trace_nfs_local_open_fh(fh, mode, status); return localio; } @@ -281,23 +287,6 @@ nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred, } EXPORT_SYMBOL_GPL(nfs_local_open_fh); -static struct bio_vec * -nfs_bvec_alloc_and_import_pagevec(struct page **pagevec, - unsigned int npages, gfp_t flags) -{ - struct bio_vec *bvec, *p; - - bvec = kmalloc_array(npages, sizeof(*bvec), flags); - if (bvec != NULL) { - for (p = bvec; npages > 0; p++, pagevec++, npages--) { - p->bv_page = *pagevec; - p->bv_len = PAGE_SIZE; - p->bv_offset = 0; - } - } - return bvec; -} - static void nfs_local_iocb_free(struct 
nfs_local_kiocb *iocb) { @@ -311,40 +300,191 @@ nfs_local_iocb_alloc(struct nfs_pgio_header *hdr, { struct nfs_local_kiocb *iocb; - iocb = kmalloc(sizeof(*iocb), flags); + iocb = kzalloc(sizeof(*iocb), flags); if (iocb == NULL) return NULL; - iocb->bvec = nfs_bvec_alloc_and_import_pagevec(hdr->page_array.pagevec, - hdr->page_array.npages, flags); + + iocb->bvec = kmalloc_array(hdr->page_array.npages, + sizeof(struct bio_vec), flags); if (iocb->bvec == NULL) { kfree(iocb); return NULL; } - if (localio_O_DIRECT_semantics && - test_bit(NFS_IOHDR_ODIRECT, &hdr->flags)) { - iocb->kiocb.ki_filp = file; - iocb->kiocb.ki_flags = IOCB_DIRECT; - } else - init_sync_kiocb(&iocb->kiocb, file); + init_sync_kiocb(&iocb->kiocb, file); - iocb->kiocb.ki_pos = hdr->args.offset; iocb->hdr = hdr; iocb->kiocb.ki_flags &= ~IOCB_APPEND; iocb->aio_complete_work = NULL; + iocb->end_iter_index = -1; + return iocb; } -static void -nfs_local_iter_init(struct iov_iter *i, struct nfs_local_kiocb *iocb, int dir) +static bool +nfs_is_local_dio_possible(struct nfs_local_kiocb *iocb, int rw, + size_t len, struct nfs_local_dio *local_dio) +{ + struct nfs_pgio_header *hdr = iocb->hdr; + loff_t offset = hdr->args.offset; + u32 nf_dio_mem_align, nf_dio_offset_align, nf_dio_read_offset_align; + loff_t start_end, orig_end, middle_end; + + nfs_to->nfsd_file_dio_alignment(iocb->localio, &nf_dio_mem_align, + &nf_dio_offset_align, &nf_dio_read_offset_align); + if (rw == ITER_DEST) + nf_dio_offset_align = nf_dio_read_offset_align; + + if (unlikely(!nf_dio_mem_align || !nf_dio_offset_align)) + return false; + if (unlikely(nf_dio_offset_align > PAGE_SIZE)) + return false; + if (unlikely(len < nf_dio_offset_align)) + return false; + + local_dio->mem_align = nf_dio_mem_align; + local_dio->offset_align = nf_dio_offset_align; + + start_end = round_up(offset, nf_dio_offset_align); + orig_end = offset + len; + middle_end = round_down(orig_end, nf_dio_offset_align); + + local_dio->middle_offset = start_end; + local_dio->end_offset = middle_end; + + local_dio->start_len = start_end - offset; + local_dio->middle_len = middle_end - start_end; + local_dio->end_len = orig_end - middle_end; + + if (rw == ITER_DEST) + trace_nfs_local_dio_read(hdr->inode, offset, len, local_dio); + else + trace_nfs_local_dio_write(hdr->inode, offset, len, local_dio); + return true; +} + +static bool nfs_iov_iter_aligned_bvec(const struct iov_iter *i, + unsigned int addr_mask, unsigned int len_mask) +{ + const struct bio_vec *bvec = i->bvec; + size_t skip = i->iov_offset; + size_t size = i->count; + + if (size & len_mask) + return false; + do { + size_t len = bvec->bv_len; + + if (len > size) + len = size; + if ((unsigned long)(bvec->bv_offset + skip) & addr_mask) + return false; + bvec++; + size -= len; + skip = 0; + } while (size); + + return true; +} + +/* + * Setup as many as 3 iov_iter based on extents described by @local_dio. + * Returns the number of iov_iter that were setup. + */ +static int +nfs_local_iters_setup_dio(struct nfs_local_kiocb *iocb, int rw, + unsigned int nvecs, size_t len, + struct nfs_local_dio *local_dio) +{ + int n_iters = 0; + struct iov_iter *iters = iocb->iters; + + /* Setup misaligned start? */ + if (local_dio->start_len) { + iov_iter_bvec(&iters[n_iters], rw, iocb->bvec, nvecs, len); + iters[n_iters].count = local_dio->start_len; + iocb->offset[n_iters] = iocb->hdr->args.offset; + iocb->iter_is_dio_aligned[n_iters] = false; + ++n_iters; + } + + /* Setup misaligned end? 
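+ * (That is, a tail shorter than offset_align that round_down() left outside + * the DIO-able middle extent; illustratively, offset 1000, len 20000 and + * offset_align 4096 give an end extent of [20480, 21000).)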
+ * If so, the end is purposely setup to be issued using buffered IO + * before the middle (which will use DIO, if DIO-aligned, with AIO). + * This creates problems if/when the end results in a partial write. + * So must save index and length of end to handle this corner case. + */ + if (local_dio->end_len) { + iov_iter_bvec(&iters[n_iters], rw, iocb->bvec, nvecs, len); + iocb->offset[n_iters] = local_dio->end_offset; + iov_iter_advance(&iters[n_iters], + local_dio->start_len + local_dio->middle_len); + iocb->iter_is_dio_aligned[n_iters] = false; + /* Save index and length of end */ + iocb->end_iter_index = n_iters; + iocb->end_len = local_dio->end_len; + ++n_iters; + } + + /* Setup DIO-aligned middle to be issued last, to allow for + * DIO with AIO completion (see nfs_local_call_{read,write}). + */ + iov_iter_bvec(&iters[n_iters], rw, iocb->bvec, nvecs, len); + if (local_dio->start_len) + iov_iter_advance(&iters[n_iters], local_dio->start_len); + iters[n_iters].count -= local_dio->end_len; + iocb->offset[n_iters] = local_dio->middle_offset; + + iocb->iter_is_dio_aligned[n_iters] = + nfs_iov_iter_aligned_bvec(&iters[n_iters], + local_dio->mem_align-1, local_dio->offset_align-1); + + if (unlikely(!iocb->iter_is_dio_aligned[n_iters])) { + trace_nfs_local_dio_misaligned(iocb->hdr->inode, + iocb->hdr->args.offset, len, local_dio); + return 0; /* no DIO-aligned IO possible */ + } + ++n_iters; + + iocb->n_iters = n_iters; + return n_iters; +} + +static noinline_for_stack void +nfs_local_iters_init(struct nfs_local_kiocb *iocb, int rw) { struct nfs_pgio_header *hdr = iocb->hdr; + struct page **pagevec = hdr->page_array.pagevec; + unsigned long v, total; + unsigned int base; + size_t len; + + v = 0; + total = hdr->args.count; + base = hdr->args.pgbase; + while (total && v < hdr->page_array.npages) { + len = min_t(size_t, total, PAGE_SIZE - base); + bvec_set_page(&iocb->bvec[v], *pagevec, len, base); + total -= len; + ++pagevec; + ++v; + base = 0; + } + len = hdr->args.count - total; + + if (test_bit(NFS_IOHDR_ODIRECT, &hdr->flags)) { + struct nfs_local_dio local_dio; + + if (nfs_is_local_dio_possible(iocb, rw, len, &local_dio) && + nfs_local_iters_setup_dio(iocb, rw, v, len, &local_dio) != 0) + return; /* is DIO-aligned */ + } - iov_iter_bvec(i, dir, iocb->bvec, hdr->page_array.npages, - hdr->args.count + hdr->args.pgbase); - if (hdr->args.pgbase != 0) - iov_iter_advance(i, hdr->args.pgbase); + /* Use buffered IO */ + iocb->offset[0] = hdr->args.offset; + iov_iter_bvec(&iocb->iters[0], rw, iocb->bvec, v, len); + iocb->n_iters = 1; } static void @@ -367,10 +507,12 @@ nfs_local_pgio_init(struct nfs_pgio_header *hdr, static void nfs_local_pgio_done(struct nfs_pgio_header *hdr, long status) { + /* Must handle partial completions */ if (status >= 0) { - hdr->res.count = status; - hdr->res.op_status = NFS4_OK; - hdr->task.tk_status = 0; + hdr->res.count += status; + /* @hdr was initialized to 0 (zeroed during allocation) */ + if (hdr->task.tk_status == 0) + hdr->res.op_status = NFS4_OK; } else { hdr->res.op_status = nfs_localio_errno_to_nfs4_stat(status); hdr->task.tk_status = status; @@ -378,12 +520,18 @@ nfs_local_pgio_done(struct nfs_pgio_header *hdr, long status) } static void +nfs_local_iocb_release(struct nfs_local_kiocb *iocb) +{ + nfs_local_file_put(iocb->localio); + nfs_local_iocb_free(iocb); +} + +static void nfs_local_pgio_release(struct nfs_local_kiocb *iocb) { struct nfs_pgio_header *hdr = iocb->hdr; - nfs_local_file_put(iocb->localio); - nfs_local_iocb_free(iocb); + 
nfs_local_iocb_release(iocb); nfs_local_hdr_release(hdr, hdr->task.tk_ops); } @@ -405,7 +553,10 @@ nfs_local_read_done(struct nfs_local_kiocb *iocb, long status) struct nfs_pgio_header *hdr = iocb->hdr; struct file *filp = iocb->kiocb.ki_filp; - nfs_local_pgio_done(hdr, status); + if ((iocb->kiocb.ki_flags & IOCB_DIRECT) && status == -EINVAL) { + /* Underlying FS will return -EINVAL if misaligned DIO is attempted. */ + pr_info_ratelimited("nfs: Unexpected direct I/O read alignment failure\n"); + } /* * Must clear replen otherwise NFSv3 data corruption will occur @@ -434,6 +585,7 @@ static void nfs_local_read_aio_complete(struct kiocb *kiocb, long ret) struct nfs_local_kiocb *iocb = container_of(kiocb, struct nfs_local_kiocb, kiocb); + nfs_local_pgio_done(iocb->hdr, ret); nfs_local_read_done(iocb, ret); nfs_local_pgio_aio_complete(iocb); /* Calls nfs_local_read_aio_complete_work */ } @@ -444,14 +596,25 @@ static void nfs_local_call_read(struct work_struct *work) container_of(work, struct nfs_local_kiocb, work); struct file *filp = iocb->kiocb.ki_filp; const struct cred *save_cred; - struct iov_iter iter; ssize_t status; save_cred = override_creds(filp->f_cred); - nfs_local_iter_init(&iter, iocb, READ); + for (int i = 0; i < iocb->n_iters ; i++) { + if (iocb->iter_is_dio_aligned[i]) { + iocb->kiocb.ki_flags |= IOCB_DIRECT; + iocb->kiocb.ki_complete = nfs_local_read_aio_complete; + iocb->aio_complete_work = nfs_local_read_aio_complete_work; + } - status = filp->f_op->read_iter(&iocb->kiocb, &iter); + iocb->kiocb.ki_pos = iocb->offset[i]; + status = filp->f_op->read_iter(&iocb->kiocb, &iocb->iters[i]); + if (status != -EIOCBQUEUED) { + nfs_local_pgio_done(iocb->hdr, status); + if (iocb->hdr->task.tk_status) + break; + } + } revert_creds(save_cred); @@ -462,33 +625,17 @@ static void nfs_local_call_read(struct work_struct *work) } static int -nfs_do_local_read(struct nfs_pgio_header *hdr, - struct nfsd_file *localio, +nfs_local_do_read(struct nfs_local_kiocb *iocb, const struct rpc_call_ops *call_ops) { - struct nfs_local_kiocb *iocb; - struct file *file = nfs_to->nfsd_file_file(localio); - - /* Don't support filesystems without read_iter */ - if (!file->f_op->read_iter) - return -EAGAIN; + struct nfs_pgio_header *hdr = iocb->hdr; dprintk("%s: vfs_read count=%u pos=%llu\n", __func__, hdr->args.count, hdr->args.offset); - iocb = nfs_local_iocb_alloc(hdr, file, GFP_KERNEL); - if (iocb == NULL) - return -ENOMEM; - iocb->localio = localio; - nfs_local_pgio_init(hdr, call_ops); hdr->res.eof = false; - if (iocb->kiocb.ki_flags & IOCB_DIRECT) { - iocb->kiocb.ki_complete = nfs_local_read_aio_complete; - iocb->aio_complete_work = nfs_local_read_aio_complete_work; - } - INIT_WORK(&iocb->work, nfs_local_call_read); queue_work(nfslocaliod_workqueue, &iocb->work); @@ -529,7 +676,7 @@ nfs_set_local_verifier(struct inode *inode, } /* Factored out from fs/nfsd/vfs.h:fh_getattr() */ -static int __vfs_getattr(struct path *p, struct kstat *stat, int version) +static int __vfs_getattr(const struct path *p, struct kstat *stat, int version) { u32 request_mask = STATX_BASIC_STATS; @@ -597,7 +744,13 @@ nfs_local_write_done(struct nfs_local_kiocb *iocb, long status) dprintk("%s: wrote %ld bytes.\n", __func__, status > 0 ? status : 0); + if ((iocb->kiocb.ki_flags & IOCB_DIRECT) && status == -EINVAL) { + /* Underlying FS will return -EINVAL if misaligned DIO is attempted. 
*/ + pr_info_ratelimited("nfs: Unexpected direct I/O write alignment failure\n"); + } + /* Handle short writes as if they are ENOSPC */ + status = hdr->res.count; if (status > 0 && status < hdr->args.count) { hdr->mds_offset += status; hdr->args.offset += status; @@ -605,11 +758,11 @@ nfs_local_write_done(struct nfs_local_kiocb *iocb, long status) hdr->args.count -= status; nfs_set_pgio_error(hdr, -ENOSPC, hdr->args.offset); status = -ENOSPC; + /* record -ENOSPC in terms of nfs_local_pgio_done */ + nfs_local_pgio_done(hdr, status); } - if (status < 0) + if (hdr->task.tk_status < 0) nfs_reset_boot_verifier(inode); - - nfs_local_pgio_done(hdr, status); } static void nfs_local_write_aio_complete_work(struct work_struct *work) @@ -626,6 +779,7 @@ static void nfs_local_write_aio_complete(struct kiocb *kiocb, long ret) struct nfs_local_kiocb *iocb = container_of(kiocb, struct nfs_local_kiocb, kiocb); + nfs_local_pgio_done(iocb->hdr, ret); nfs_local_write_done(iocb, ret); nfs_local_pgio_aio_complete(iocb); /* Calls nfs_local_write_aio_complete_work */ } @@ -637,16 +791,53 @@ static void nfs_local_call_write(struct work_struct *work) struct file *filp = iocb->kiocb.ki_filp; unsigned long old_flags = current->flags; const struct cred *save_cred; - struct iov_iter iter; ssize_t status; current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO; save_cred = override_creds(filp->f_cred); - nfs_local_iter_init(&iter, iocb, WRITE); - file_start_write(filp); - status = filp->f_op->write_iter(&iocb->kiocb, &iter); + file_start_write(filp); + for (int i = 0; i < iocb->n_iters ; i++) { + if (iocb->iter_is_dio_aligned[i]) { + iocb->kiocb.ki_flags |= IOCB_DIRECT; + iocb->kiocb.ki_complete = nfs_local_write_aio_complete; + iocb->aio_complete_work = nfs_local_write_aio_complete_work; + } +retry: + iocb->kiocb.ki_pos = iocb->offset[i]; + status = filp->f_op->write_iter(&iocb->kiocb, &iocb->iters[i]); + if (status != -EIOCBQUEUED) { + if (unlikely(status >= 0 && status < iocb->iters[i].count)) { + /* partial write */ + if (i == iocb->end_iter_index) { + /* Must not account partial end, otherwise, due + * to end being issued before middle: the partial + * write accounting in nfs_local_write_done() + * would incorrectly advance hdr->args.offset + */ + status = 0; + } else { + /* Partial write at start or buffered middle, + * exit early. + */ + nfs_local_pgio_done(iocb->hdr, status); + break; + } + } else if (unlikely(status == -ENOTBLK && + (iocb->kiocb.ki_flags & IOCB_DIRECT))) { + /* VFS will return -ENOTBLK if DIO WRITE fails to + * invalidate the page cache. Retry using buffered IO. + */ + iocb->kiocb.ki_flags &= ~IOCB_DIRECT; + iocb->kiocb.ki_complete = NULL; + iocb->aio_complete_work = NULL; + goto retry; + } + nfs_local_pgio_done(iocb->hdr, status); + if (iocb->hdr->task.tk_status) + break; + } + } file_end_write(filp); revert_creds(save_cred); @@ -660,26 +851,15 @@ static void nfs_local_call_write(struct work_struct *work) } static int -nfs_do_local_write(struct nfs_pgio_header *hdr, - struct nfsd_file *localio, +nfs_local_do_write(struct nfs_local_kiocb *iocb, const struct rpc_call_ops *call_ops) { - struct nfs_local_kiocb *iocb; - struct file *file = nfs_to->nfsd_file_file(localio); - - /* Don't support filesystems without write_iter */ - if (!file->f_op->write_iter) - return -EAGAIN; + struct nfs_pgio_header *hdr = iocb->hdr; dprintk("%s: vfs_write count=%u pos=%llu %s\n", __func__, hdr->args.count, hdr->args.offset, (hdr->args.stable == NFS_UNSTABLE) ?
"unstable" : "stable"); - iocb = nfs_local_iocb_alloc(hdr, file, GFP_NOIO); - if (iocb == NULL) - return -ENOMEM; - iocb->localio = localio; - switch (hdr->args.stable) { default: break; @@ -694,43 +874,74 @@ nfs_do_local_write(struct nfs_pgio_header *hdr, nfs_set_local_verifier(hdr->inode, hdr->res.verf, hdr->args.stable); - if (iocb->kiocb.ki_flags & IOCB_DIRECT) { - iocb->kiocb.ki_complete = nfs_local_write_aio_complete; - iocb->aio_complete_work = nfs_local_write_aio_complete_work; - } - INIT_WORK(&iocb->work, nfs_local_call_write); queue_work(nfslocaliod_workqueue, &iocb->work); return 0; } +static struct nfs_local_kiocb * +nfs_local_iocb_init(struct nfs_pgio_header *hdr, struct nfsd_file *localio) +{ + struct file *file = nfs_to->nfsd_file_file(localio); + struct nfs_local_kiocb *iocb; + gfp_t gfp_mask; + int rw; + + if (hdr->rw_mode & FMODE_READ) { + if (!file->f_op->read_iter) + return ERR_PTR(-EOPNOTSUPP); + gfp_mask = GFP_KERNEL; + rw = ITER_DEST; + } else { + if (!file->f_op->write_iter) + return ERR_PTR(-EOPNOTSUPP); + gfp_mask = GFP_NOIO; + rw = ITER_SOURCE; + } + + iocb = nfs_local_iocb_alloc(hdr, file, gfp_mask); + if (iocb == NULL) + return ERR_PTR(-ENOMEM); + iocb->hdr = hdr; + iocb->localio = localio; + + nfs_local_iters_init(iocb, rw); + + return iocb; +} + int nfs_local_doio(struct nfs_client *clp, struct nfsd_file *localio, struct nfs_pgio_header *hdr, const struct rpc_call_ops *call_ops) { + struct nfs_local_kiocb *iocb; int status = 0; if (!hdr->args.count) return 0; + iocb = nfs_local_iocb_init(hdr, localio); + if (IS_ERR(iocb)) + return PTR_ERR(iocb); + switch (hdr->rw_mode) { case FMODE_READ: - status = nfs_do_local_read(hdr, localio, call_ops); + status = nfs_local_do_read(iocb, call_ops); break; case FMODE_WRITE: - status = nfs_do_local_write(hdr, localio, call_ops); + status = nfs_local_do_write(iocb, call_ops); break; default: dprintk("%s: invalid mode: %d\n", __func__, hdr->rw_mode); - status = -EINVAL; + status = -EOPNOTSUPP; } if (status != 0) { if (status == -EAGAIN) nfs_localio_disable_client(clp); - nfs_local_file_put(localio); + nfs_local_iocb_release(iocb); hdr->task.tk_status = status; nfs_local_hdr_release(hdr, call_ops); } diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index 6e75c6c2d234..9eff09158518 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -23,8 +23,8 @@ #include <linux/nfs2.h> #include <linux/nfs_fs.h> #include <linux/nfs_common.h> -#include "nfstrace.h" #include "internal.h" +#include "nfstrace.h" #define NFSDBG_FACILITY NFSDBG_XDR diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index 4ae01c10b7e2..e17d72908412 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -23,8 +23,8 @@ #include <linux/nfsacl.h> #include <linux/nfs_common.h> -#include "nfstrace.h" #include "internal.h" +#include "nfstrace.h" #define NFSDBG_FACILITY NFSDBG_XDR diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 6a0b5871ba3b..d537fb0c230e 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -1514,7 +1514,7 @@ static ssize_t _nfs42_proc_listxattrs(struct inode *inode, void *buf, ret = -ENOMEM; - res.scratch = alloc_page(GFP_KERNEL); + res.scratch = folio_alloc(GFP_KERNEL, 0); if (!res.scratch) goto out; @@ -1552,7 +1552,7 @@ out_free_pages: } kfree(pages); out_free_scratch: - __free_page(res.scratch); + folio_put(res.scratch); out: return ret; diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index 4cc915d5741d..e10d83ba835e 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -1781,7 +1781,7 @@ static int 
nfs4_xdr_dec_listxattrs(struct rpc_rqst *rqstp, struct compound_hdr hdr; int status; - xdr_set_scratch_page(xdr, res->scratch); + xdr_set_scratch_folio(xdr, res->scratch); status = decode_compound_hdr(xdr, &hdr); if (status) diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index c9a0d1e420c6..7f43e890d356 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -456,4 +456,5 @@ const struct file_operations nfs4_file_operations = { #else .llseek = nfs_file_llseek, #endif + .fop_flags = FOP_DONTCACHE, }; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index ce61253efd45..f58098417142 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -391,7 +391,9 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent *p++ = htonl(attrs); /* bitmap */ *p++ = htonl(12); /* attribute buffer length */ *p++ = htonl(NF4DIR); + spin_lock(&dentry->d_lock); p = xdr_encode_hyper(p, NFS_FILEID(d_inode(dentry->d_parent))); + spin_unlock(&dentry->d_lock); readdir->pgbase = (char *)p - (char *)start; readdir->count -= readdir->pgbase; @@ -6160,7 +6162,7 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, } /* for decoding across pages */ - res.acl_scratch = alloc_page(GFP_KERNEL); + res.acl_scratch = folio_alloc(GFP_KERNEL, 0); if (!res.acl_scratch) goto out_free; @@ -6196,7 +6198,7 @@ out_free: while (--i >= 0) __free_page(pages[i]); if (res.acl_scratch) - __free_page(res.acl_scratch); + folio_put(res.acl_scratch); kfree(pages); return ret; } @@ -7872,10 +7874,10 @@ int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, return err; do { err = _nfs4_do_setlk(state, F_SETLK, fl, NFS_LOCK_NEW); - if (err != -NFS4ERR_DELAY) + if (err != -NFS4ERR_DELAY && err != -NFS4ERR_GRACE) break; ssleep(1); - } while (err == -NFS4ERR_DELAY); + } while (err == -NFS4ERR_DELAY || err == -NFS4ERR_GRACE); return nfs4_handle_delegation_recall_error(server, state, stateid, fl, err); } @@ -9442,7 +9444,7 @@ static int nfs4_verify_back_channel_attrs(struct nfs41_create_session_args *args goto out; if (rcvd->max_rqst_sz > sent->max_rqst_sz) return -EINVAL; - if (rcvd->max_resp_sz < sent->max_resp_sz) + if (rcvd->max_resp_sz > sent->max_resp_sz) return -EINVAL; if (rcvd->max_resp_sz_cached > sent->max_resp_sz_cached) return -EINVAL; diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 7612e977e80b..01179f7de322 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -2744,6 +2744,9 @@ out_error: case -ENETUNREACH: nfs_mark_client_ready(clp, -EIO); break; + case -EINVAL: + nfs_mark_client_ready(clp, status); + break; default: ssleep(1); break; diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 49ff98571fa5..1d0e6c10f921 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -4930,7 +4930,7 @@ static int decode_attr_pnfstype(struct xdr_stream *xdr, uint32_t *bitmap, } /* - * The prefered block size for layout directed io + * The preferred block size for layout directed io */ static int decode_attr_layout_blksize(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) @@ -6585,7 +6585,7 @@ nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr, int status; if (res->acl_scratch != NULL) - xdr_set_scratch_page(xdr, res->acl_scratch); + xdr_set_scratch_folio(xdr, res->acl_scratch); status = decode_compound_hdr(xdr, &hdr); if (status) goto out; diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 627115179795..6ce55e8e6b67 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -45,6 +45,23 @@ { BIT(NFS_INO_LAYOUTSTATS), "LAYOUTSTATS"
}, \ { BIT(NFS_INO_ODIRECT), "ODIRECT" }) +#define nfs_show_wb_flags(v) \ + __print_flags(v, "|", \ + { BIT(PG_BUSY), "BUSY" }, \ + { BIT(PG_MAPPED), "MAPPED" }, \ + { BIT(PG_FOLIO), "FOLIO" }, \ + { BIT(PG_CLEAN), "CLEAN" }, \ + { BIT(PG_COMMIT_TO_DS), "COMMIT_TO_DS" }, \ + { BIT(PG_INODE_REF), "INODE_REF" }, \ + { BIT(PG_HEADLOCK), "HEADLOCK" }, \ + { BIT(PG_TEARDOWN), "TEARDOWN" }, \ + { BIT(PG_UNLOCKPAGE), "UNLOCKPAGE" }, \ + { BIT(PG_UPTODATE), "UPTODATE" }, \ + { BIT(PG_WB_END), "WB_END" }, \ + { BIT(PG_REMOVE), "REMOVE" }, \ + { BIT(PG_CONTENDED1), "CONTENDED1" }, \ + { BIT(PG_CONTENDED2), "CONTENDED2" }) + DECLARE_EVENT_CLASS(nfs_inode_event, TP_PROTO( const struct inode *inode @@ -967,7 +984,7 @@ DECLARE_EVENT_CLASS(nfs_folio_event, __entry->fileid = nfsi->fileid; __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); __entry->version = inode_peek_iversion_raw(inode); - __entry->offset = offset, + __entry->offset = offset; __entry->count = count; ), @@ -1017,8 +1034,8 @@ DECLARE_EVENT_CLASS(nfs_folio_event_done, __entry->fileid = nfsi->fileid; __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); __entry->version = inode_peek_iversion_raw(inode); - __entry->offset = offset, - __entry->count = count, + __entry->offset = offset; + __entry->count = count; __entry->ret = ret; ), @@ -1051,6 +1068,73 @@ DEFINE_NFS_FOLIO_EVENT_DONE(nfs_writeback_folio_done); DEFINE_NFS_FOLIO_EVENT(nfs_invalidate_folio); DEFINE_NFS_FOLIO_EVENT_DONE(nfs_launder_folio_done); +DEFINE_NFS_FOLIO_EVENT(nfs_try_to_update_request); +DEFINE_NFS_FOLIO_EVENT_DONE(nfs_try_to_update_request_done); + +DEFINE_NFS_FOLIO_EVENT(nfs_update_folio); +DEFINE_NFS_FOLIO_EVENT_DONE(nfs_update_folio_done); + +DEFINE_NFS_FOLIO_EVENT(nfs_write_begin); +DEFINE_NFS_FOLIO_EVENT_DONE(nfs_write_begin_done); + +DEFINE_NFS_FOLIO_EVENT(nfs_write_end); +DEFINE_NFS_FOLIO_EVENT_DONE(nfs_write_end_done); + +DEFINE_NFS_FOLIO_EVENT(nfs_writepages); +DEFINE_NFS_FOLIO_EVENT_DONE(nfs_writepages_done); + +DECLARE_EVENT_CLASS(nfs_kiocb_event, + TP_PROTO( + const struct kiocb *iocb, + const struct iov_iter *iter + ), + + TP_ARGS(iocb, iter), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(u64, version) + __field(loff_t, offset) + __field(size_t, count) + __field(int, flags) + ), + + TP_fast_assign( + const struct inode *inode = file_inode(iocb->ki_filp); + const struct nfs_inode *nfsi = NFS_I(inode); + + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = nfsi->fileid; + __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); + __entry->version = inode_peek_iversion_raw(inode); + __entry->offset = iocb->ki_pos; + __entry->count = iov_iter_count(iter); + __entry->flags = iocb->ki_flags; + ), + + TP_printk( + "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu offset=%lld count=%zu ki_flags=%s", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, __entry->version, + __entry->offset, __entry->count, + __print_flags(__entry->flags, "|", TRACE_IOCB_STRINGS) + ) +); + +#define DEFINE_NFS_KIOCB_EVENT(name) \ + DEFINE_EVENT(nfs_kiocb_event, name, \ + TP_PROTO( \ + const struct kiocb *iocb, \ + const struct iov_iter *iter \ + ), \ + TP_ARGS(iocb, iter)) + +DEFINE_NFS_KIOCB_EVENT(nfs_file_read); +DEFINE_NFS_KIOCB_EVENT(nfs_file_write); + TRACE_EVENT(nfs_aop_readahead, TP_PROTO( const struct inode *inode, @@ -1398,6 +1482,55 @@ TRACE_EVENT(nfs_writeback_done, ) ); +DECLARE_EVENT_CLASS(nfs_page_class, + TP_PROTO( + const struct nfs_page *req + ), + + TP_ARGS(req), + + 
TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(const struct nfs_page *__private, req) + __field(loff_t, offset) + __field(unsigned int, count) + __field(unsigned long, flags) + ), + + TP_fast_assign( + const struct inode *inode = folio_inode(req->wb_folio); + const struct nfs_inode *nfsi = NFS_I(inode); + + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = nfsi->fileid; + __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); + __entry->req = req; + __entry->offset = req_offset(req); + __entry->count = req->wb_bytes; + __entry->flags = req->wb_flags; + ), + + TP_printk( + "fileid=%02x:%02x:%llu fhandle=0x%08x req=%p offset=%lld count=%u flags=%s", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, __entry->fhandle, + __entry->req, __entry->offset, __entry->count, + nfs_show_wb_flags(__entry->flags) + ) +); + +#define DEFINE_NFS_PAGE_EVENT(name) \ + DEFINE_EVENT(nfs_page_class, name, \ + TP_PROTO( \ + const struct nfs_page *req \ + ), \ + TP_ARGS(req)) + +DEFINE_NFS_PAGE_EVENT(nfs_writepage_setup); +DEFINE_NFS_PAGE_EVENT(nfs_do_writepage); + DECLARE_EVENT_CLASS(nfs_page_error_class, TP_PROTO( const struct inode *inode, @@ -1599,6 +1732,76 @@ DEFINE_NFS_DIRECT_REQ_EVENT(nfs_direct_write_completion); DEFINE_NFS_DIRECT_REQ_EVENT(nfs_direct_write_schedule_iovec); DEFINE_NFS_DIRECT_REQ_EVENT(nfs_direct_write_reschedule_io); +#if IS_ENABLED(CONFIG_NFS_LOCALIO) + +DECLARE_EVENT_CLASS(nfs_local_dio_class, + TP_PROTO( + const struct inode *inode, + loff_t offset, + ssize_t count, + const struct nfs_local_dio *local_dio + ), + TP_ARGS(inode, offset, count, local_dio), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u64, fileid) + __field(u32, fhandle) + __field(loff_t, offset) + __field(ssize_t, count) + __field(u32, mem_align) + __field(u32, offset_align) + __field(loff_t, start) + __field(ssize_t, start_len) + __field(loff_t, middle) + __field(ssize_t, middle_len) + __field(loff_t, end) + __field(ssize_t, end_len) + ), + TP_fast_assign( + const struct nfs_inode *nfsi = NFS_I(inode); + const struct nfs_fh *fh = &nfsi->fh; + + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = nfsi->fileid; + __entry->fhandle = nfs_fhandle_hash(fh); + __entry->offset = offset; + __entry->count = count; + __entry->mem_align = local_dio->mem_align; + __entry->offset_align = local_dio->offset_align; + __entry->start = offset; + __entry->start_len = local_dio->start_len; + __entry->middle = local_dio->middle_offset; + __entry->middle_len = local_dio->middle_len; + __entry->end = local_dio->end_offset; + __entry->end_len = local_dio->end_len; + ), + TP_printk("fileid=%02x:%02x:%llu fhandle=0x%08x " + "offset=%lld count=%zd " + "mem_align=%u offset_align=%u " + "start=%llu+%zd middle=%llu+%zd end=%llu+%zd", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, __entry->offset, __entry->count, + __entry->mem_align, __entry->offset_align, + __entry->start, __entry->start_len, + __entry->middle, __entry->middle_len, + __entry->end, __entry->end_len) +) + +#define DEFINE_NFS_LOCAL_DIO_EVENT(name) \ +DEFINE_EVENT(nfs_local_dio_class, nfs_local_dio_##name, \ + TP_PROTO(const struct inode *inode, \ + loff_t offset, \ + ssize_t count, \ + const struct nfs_local_dio *local_dio),\ + TP_ARGS(inode, offset, count, local_dio)) + +DEFINE_NFS_LOCAL_DIO_EVENT(read); +DEFINE_NFS_LOCAL_DIO_EVENT(write); +DEFINE_NFS_LOCAL_DIO_EVENT(misaligned); + +#endif /* CONFIG_NFS_LOCALIO */ + 
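The nfs_local_dio_* tracepoints defined above expose the three-way extent split that nfs_is_local_dio_possible() computes before LOCALIO attempts O_DIRECT: a possibly misaligned start, a DIO-aligned middle, and a possibly misaligned end, derived by plain round_up()/round_down() arithmetic against the file's exported offset alignment. What follows is a minimal, standalone C sketch of that arithmetic using hypothetical values (offset 1000, length 10000, 4096-byte alignment); it is illustrative only and not part of the patch.

	#include <stdio.h>

	/* Userspace stand-ins for the kernel's round_up()/round_down() */
	#define round_up(x, a)   ((((x) + (a) - 1) / (a)) * (a))
	#define round_down(x, a) (((x) / (a)) * (a))

	int main(void)
	{
		long long offset = 1000, len = 10000, align = 4096; /* hypothetical */
		long long start_end  = round_up(offset, align);     /* 4096  */
		long long orig_end   = offset + len;                /* 11000 */
		long long middle_end = round_down(orig_end, align); /* 8192  */

		/* start and end fall back to buffered IO; middle goes O_DIRECT */
		printf("start:  %lld+%lld (buffered)\n", offset, start_end - offset);
		printf("middle: %lld+%lld (O_DIRECT)\n", start_end, middle_end - start_end);
		printf("end:    %lld+%lld (buffered)\n", middle_end, orig_end - middle_end);
		return 0;
	}

As the comment in nfs_local_iters_setup_dio() explains, the misaligned end is submitted before the aligned middle so the middle can complete via AIO; the start/middle/end fields in the trace output are exactly these three (offset, length) pairs.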
TRACE_EVENT(nfs_fh_to_dentry, TP_PROTO( const struct super_block *sb, @@ -1713,10 +1916,10 @@ TRACE_EVENT(nfs_local_open_fh, ), TP_printk( - "error=%d fhandle=0x%08x mode=%s", - __entry->error, + "fhandle=0x%08x mode=%s result=%d", __entry->fhandle, - show_fs_fmode_flags(__entry->fmode) + show_fs_fmode_flags(__entry->fmode), + __entry->error ) ); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 647c53d1418a..0fb6905736d5 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -296,7 +296,7 @@ static void nfs_folio_end_writeback(struct folio *folio) { struct nfs_server *nfss = NFS_SERVER(folio->mapping->host); - folio_end_writeback(folio); + folio_end_writeback_no_dropbehind(folio); if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) { nfss->write_congested = 0; @@ -593,6 +593,7 @@ static int nfs_do_writepage(struct folio *folio, struct writeback_control *wbc, if (IS_ERR(req)) return PTR_ERR(req); + trace_nfs_do_writepage(req); nfs_folio_set_writeback(folio); WARN_ON_ONCE(test_bit(PG_CLEAN, &req->wb_flags)); @@ -656,12 +657,14 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) int priority = 0; int err; + trace_nfs_writepages(inode, wbc->range_start, wbc->range_end - wbc->range_start); + /* Wait with writeback until write congestion eases */ if (wbc->sync_mode == WB_SYNC_NONE && nfss->write_congested) { err = wait_event_killable(nfss->write_congestion_wait, nfss->write_congested == 0); if (err) - return err; + goto out_err; } nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES); @@ -692,10 +695,10 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) } while (err < 0 && !nfs_error_is_fatal(err)); nfs_io_completion_put(ioc); - if (err < 0) - goto out_err; - return 0; + if (err > 0) + err = 0; out_err: + trace_nfs_writepages_done(inode, wbc->range_start, wbc->range_end - wbc->range_start, err); return err; } @@ -745,6 +748,8 @@ static void nfs_inode_remove_request(struct nfs_page *req) clear_bit(PG_MAPPED, &req->wb_head->wb_flags); } spin_unlock(&mapping->i_private_lock); + + folio_end_dropbehind(folio); } nfs_page_group_unlock(req); @@ -926,7 +931,7 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr) req->wb_nio = 0; memcpy(&req->wb_verf, &hdr->verf.verifier, sizeof(req->wb_verf)); nfs_mark_request_commit(req, hdr->lseg, &cinfo, - hdr->pgio_mirror_idx); + hdr->ds_commit_idx); goto next; } remove_req: @@ -1017,11 +1022,12 @@ static struct nfs_page *nfs_try_to_update_request(struct folio *folio, unsigned int end; int error; + trace_nfs_try_to_update_request(folio_inode(folio), offset, bytes); end = offset + bytes; req = nfs_lock_and_join_requests(folio); if (IS_ERR_OR_NULL(req)) - return req; + goto out; rqend = req->wb_offset + req->wb_bytes; /* @@ -1043,6 +1049,9 @@ static struct nfs_page *nfs_try_to_update_request(struct folio *folio, else req->wb_bytes = rqend - req->wb_offset; req->wb_nio = 0; +out: + trace_nfs_try_to_update_request_done(folio_inode(folio), offset, bytes, + PTR_ERR_OR_ZERO(req)); return req; out_flushme: /* @@ -1053,6 +1062,7 @@ out_flushme: nfs_mark_request_dirty(req); nfs_unlock_and_release_request(req); error = nfs_wb_folio(folio->mapping->host, folio); + trace_nfs_try_to_update_request_done(folio_inode(folio), offset, bytes, error); return (error < 0) ? 
ERR_PTR(error) : NULL; } @@ -1090,6 +1100,7 @@ static int nfs_writepage_setup(struct nfs_open_context *ctx, req = nfs_setup_write_request(ctx, folio, offset, count); if (IS_ERR(req)) return PTR_ERR(req); + trace_nfs_writepage_setup(req); /* Update file length */ nfs_grow_file(folio, offset, count); nfs_mark_uptodate(req); @@ -1290,6 +1301,8 @@ int nfs_update_folio(struct file *file, struct folio *folio, nfs_inc_stats(inode, NFSIOS_VFSUPDATEPAGE); + trace_nfs_update_folio(inode, offset, count); + dprintk("NFS: nfs_update_folio(%pD2 %d@%lld)\n", file, count, (long long)(folio_pos(folio) + offset)); @@ -1309,6 +1322,7 @@ int nfs_update_folio(struct file *file, struct folio *folio, if (status < 0) nfs_set_pageerror(mapping); out: + trace_nfs_update_folio_done(inode, offset, count, status); dprintk("NFS: nfs_update_folio returns %d (isize %lld)\n", status, (long long)i_size_read(inode)); return status; @@ -1806,7 +1820,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) nfs_mapping_set_error(folio, status); nfs_inode_remove_request(req); } - dprintk_cont(", error = %d\n", status); + dprintk(", error = %d\n", status); goto next; } @@ -1816,11 +1830,11 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) /* We have a match */ if (folio) nfs_inode_remove_request(req); - dprintk_cont(" OK\n"); + dprintk(" OK\n"); goto next; } /* We have a mismatch. Write the page again */ - dprintk_cont(" mismatch\n"); + dprintk(" mismatch\n"); nfs_mark_request_dirty(req); atomic_long_inc(&NFS_I(data->inode)->redirtied_pages); next: diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index cadfc2bae60e..caa695c06efb 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -402,7 +402,7 @@ static struct svc_export *svc_export_update(struct svc_export *new, struct svc_export *old); static struct svc_export *svc_export_lookup(struct svc_export *); -static int check_export(struct path *path, int *flags, unsigned char *uuid) +static int check_export(const struct path *path, int *flags, unsigned char *uuid) { struct inode *inode = d_inode(path->dentry); @@ -1181,7 +1181,7 @@ denied: * use exp_get_by_name() or exp_find(). 
*/ struct svc_export * -rqst_exp_get_by_name(struct svc_rqst *rqstp, struct path *path) +rqst_exp_get_by_name(struct svc_rqst *rqstp, const struct path *path) { struct svc_export *gssexp, *exp = ERR_PTR(-ENOENT); struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h index b9c0adb3ce09..cb36e6cce829 100644 --- a/fs/nfsd/export.h +++ b/fs/nfsd/export.h @@ -111,7 +111,7 @@ int nfsd_export_init(struct net *); void nfsd_export_shutdown(struct net *); void nfsd_export_flush(struct net *); struct svc_export * rqst_exp_get_by_name(struct svc_rqst *, - struct path *); + const struct path *); struct svc_export * rqst_exp_parent(struct svc_rqst *, struct path *); struct svc_export * rqst_find_fsidzero_export(struct svc_rqst *); diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 85ca663c052c..e010d90aeb27 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -231,6 +231,9 @@ nfsd_file_alloc(struct net *net, struct inode *inode, unsigned char need, refcount_set(&nf->nf_ref, 1); nf->nf_may = need; nf->nf_mark = NULL; + nf->nf_dio_mem_align = 0; + nf->nf_dio_offset_align = 0; + nf->nf_dio_read_offset_align = 0; return nf; } @@ -1070,6 +1073,35 @@ nfsd_file_is_cached(struct inode *inode) } static __be32 +nfsd_file_get_dio_attrs(const struct svc_fh *fhp, struct nfsd_file *nf) +{ + struct inode *inode = file_inode(nf->nf_file); + struct kstat stat; + __be32 status; + + /* Currently only need to get DIO alignment info for regular files */ + if (!S_ISREG(inode->i_mode)) + return nfs_ok; + + status = fh_getattr(fhp, &stat); + if (status != nfs_ok) + return status; + + trace_nfsd_file_get_dio_attrs(inode, &stat); + + if (stat.result_mask & STATX_DIOALIGN) { + nf->nf_dio_mem_align = stat.dio_mem_align; + nf->nf_dio_offset_align = stat.dio_offset_align; + } + if (stat.result_mask & STATX_DIO_READ_ALIGN) + nf->nf_dio_read_offset_align = stat.dio_read_offset_align; + else + nf->nf_dio_read_offset_align = nf->nf_dio_offset_align; + + return nfs_ok; +} + +static __be32 nfsd_file_do_acquire(struct svc_rqst *rqstp, struct net *net, struct svc_cred *cred, struct auth_domain *client, @@ -1187,6 +1219,8 @@ open_file: } status = nfserrno(ret); trace_nfsd_file_open(nf, status); + if (status == nfs_ok) + status = nfsd_file_get_dio_attrs(fhp, nf); } } else status = nfserr_jukebox; diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h index 722b26c71e45..237a05c74211 100644 --- a/fs/nfsd/filecache.h +++ b/fs/nfsd/filecache.h @@ -54,6 +54,10 @@ struct nfsd_file { struct list_head nf_gc; struct rcu_head nf_rcu; ktime_t nf_birthtime; + + u32 nf_dio_mem_align; + u32 nf_dio_offset_align; + u32 nf_dio_read_offset_align; }; int nfsd_file_cache_init(void); diff --git a/fs/nfsd/localio.c b/fs/nfsd/localio.c index cb237f1b902a..9e0a37cd29d8 100644 --- a/fs/nfsd/localio.c +++ b/fs/nfsd/localio.c @@ -117,6 +117,16 @@ nfsd_open_local_fh(struct net *net, struct auth_domain *dom, return localio; } +static void nfsd_file_dio_alignment(struct nfsd_file *nf, + u32 *nf_dio_mem_align, + u32 *nf_dio_offset_align, + u32 *nf_dio_read_offset_align) +{ + *nf_dio_mem_align = nf->nf_dio_mem_align; + *nf_dio_offset_align = nf->nf_dio_offset_align; + *nf_dio_read_offset_align = nf->nf_dio_read_offset_align; +} + static const struct nfsd_localio_operations nfsd_localio_ops = { .nfsd_net_try_get = nfsd_net_try_get, .nfsd_net_put = nfsd_net_put, @@ -124,6 +134,7 @@ static const struct nfsd_localio_operations nfsd_localio_ops = { .nfsd_file_put_local = nfsd_file_put_local, 
.nfsd_file_get_local = nfsd_file_get_local, .nfsd_file_file = nfsd_file_file, + .nfsd_file_dio_alignment = nfsd_file_dio_alignment, }; void nfsd_localio_ops_init(void) diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index c19a74a36c45..2b79129703d5 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1954,7 +1954,7 @@ int nfsd_nl_listener_set_doit(struct sk_buff *skb, struct genl_info *info) * remaining listeners and recreate the list. */ if (delete) - svc_xprt_destroy_all(serv, net); + svc_xprt_destroy_all(serv, net, false); /* walk list of addrs again, open any that still don't exist */ nlmsg_for_each_attr_type(attr, NFSD_A_SERVER_SOCK_ADDR, info->nlhdr, diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 82b0111ac469..7057ddd7a0a8 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -535,16 +535,13 @@ void nfsd_destroy_serv(struct net *net) #endif } - svc_xprt_destroy_all(serv, net); - /* * write_ports can create the server without actually starting - * any threads--if we get shut down before any threads are + * any threads. If we get shut down before any threads are * started, then nfsd_destroy_serv will be run before any of this * other initialization has been done except the rpcb information. */ - svc_rpcb_cleanup(serv, net); - + svc_xprt_destroy_all(serv, net, true); nfsd_shutdown_net(net); svc_destroy(&serv); } diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index a664fdf1161e..6e2c8e2aab10 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -1133,6 +1133,33 @@ TRACE_EVENT(nfsd_file_alloc, ) ); +TRACE_EVENT(nfsd_file_get_dio_attrs, + TP_PROTO( + const struct inode *inode, + const struct kstat *stat + ), + TP_ARGS(inode, stat), + TP_STRUCT__entry( + __field(const void *, inode) + __field(unsigned long, mask) + __field(u32, mem_align) + __field(u32, offset_align) + __field(u32, read_offset_align) + ), + TP_fast_assign( + __entry->inode = inode; + __entry->mask = stat->result_mask; + __entry->mem_align = stat->dio_mem_align; + __entry->offset_align = stat->dio_offset_align; + __entry->read_offset_align = stat->dio_read_offset_align; + ), + TP_printk("inode=%p flags=%s mem_align=%u offset_align=%u read_offset_align=%u", + __entry->inode, show_statx_mask(__entry->mask), + __entry->mem_align, __entry->offset_align, + __entry->read_offset_align + ) +); + TRACE_EVENT(nfsd_file_acquire, TP_PROTO( const struct svc_rqst *rqstp, diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index eff04959606f..fde3e0c11dba 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -185,6 +185,10 @@ static inline __be32 fh_getattr(const struct svc_fh *fh, struct kstat *stat) u32 request_mask = STATX_BASIC_STATS; struct path p = {.mnt = fh->fh_export->ex_path.mnt, .dentry = fh->fh_dentry}; + struct inode *inode = d_inode(p.dentry); + + if (S_ISREG(inode->i_mode)) + request_mask |= (STATX_DIOALIGN | STATX_DIO_READ_ALIGN); if (fh->fh_maxsize == NFS4_FHSIZE) request_mask |= (STATX_BTIME | STATX_CHANGE_COOKIE); diff --git a/fs/nsfs.c b/fs/nsfs.c index e7fd8a790aaa..648dc59bef7f 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -571,7 +571,7 @@ static int nsfs_export_permission(struct handle_to_path_ctx *ctx, return 0; } -static struct file *nsfs_export_open(struct path *path, unsigned int oflags) +static struct file *nsfs_export_open(const struct path *path, unsigned int oflags) { return file_open_root(path, "", oflags, 0); } diff --git a/fs/open.c b/fs/open.c index 4890b13461c7..3d64372ecc67 100644 --- a/fs/open.c +++ b/fs/open.c @@ -1022,8 +1022,8 @@ cleanup_all: put_file_access(f); cleanup_file: path_put(&f->f_path); - 
f->f_path.mnt = NULL; - f->f_path.dentry = NULL; + f->__f_path.mnt = NULL; + f->__f_path.dentry = NULL; f->f_inode = NULL; return error; } @@ -1050,7 +1050,7 @@ int finish_open(struct file *file, struct dentry *dentry, { BUG_ON(file->f_mode & FMODE_OPENED); /* once it's opened, it's opened */ - file->f_path.dentry = dentry; + file->__f_path.dentry = dentry; return do_dentry_open(file, open); } EXPORT_SYMBOL(finish_open); @@ -1073,7 +1073,7 @@ int finish_no_open(struct file *file, struct dentry *dentry) { if (IS_ERR(dentry)) return PTR_ERR(dentry); - file->f_path.dentry = dentry; + file->__f_path.dentry = dentry; return 0; } EXPORT_SYMBOL(finish_no_open); @@ -1093,7 +1093,7 @@ int vfs_open(const struct path *path, struct file *file) { int ret; - file->f_path = *path; + file->__f_path = *path; ret = do_dentry_open(file, NULL); if (!ret) { /* diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 66bd43a99d2e..aac7e34f56c1 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -242,7 +242,7 @@ static int ovl_verify_area(loff_t pos, loff_t pos2, loff_t len, loff_t totlen) return 0; } -static int ovl_sync_file(struct path *path) +static int ovl_sync_file(const struct path *path) { struct file *new_file; int err; diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index f5b8877d5fe2..fc52c796061d 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -120,7 +120,7 @@ static bool ovl_is_real_file(const struct file *realfile, } static struct file *ovl_real_file_path(const struct file *file, - struct path *realpath) + const struct path *realpath) { struct ovl_file *of = file->private_data; struct file *realfile = of->realfile; diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 6fd4fbfb0908..c8fd5951fc5e 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -562,11 +562,11 @@ int ovl_set_metacopy_xattr(struct ovl_fs *ofs, struct dentry *d, struct ovl_metacopy *metacopy); bool ovl_is_metacopy_dentry(struct dentry *dentry); char *ovl_get_redirect_xattr(struct ovl_fs *ofs, const struct path *path, int padding); -int ovl_ensure_verity_loaded(struct path *path); +int ovl_ensure_verity_loaded(const struct path *path); int ovl_validate_verity(struct ovl_fs *ofs, - struct path *metapath, - struct path *datapath); -int ovl_get_verity_digest(struct ovl_fs *ofs, struct path *src, + const struct path *metapath, + const struct path *datapath); +int ovl_get_verity_digest(struct ovl_fs *ofs, const struct path *src, struct ovl_metacopy *metacopy); int ovl_sync_status(struct ovl_fs *ofs); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 34875c46dbe2..43ee4c7296a7 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -404,7 +404,7 @@ static int ovl_check_namelen(const struct path *path, struct ovl_fs *ofs, return err; } -static int ovl_lower_dir(const char *name, struct path *path, +static int ovl_lower_dir(const char *name, const struct path *path, struct ovl_fs *ofs, int *stack_depth) { int fh_type; diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index bec4a39d1b97..f76672f2e686 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -1381,7 +1381,7 @@ err_free: } /* Call with mounter creds as it may open the file */ -int ovl_ensure_verity_loaded(struct path *datapath) +int ovl_ensure_verity_loaded(const struct path *datapath) { struct inode *inode = d_inode(datapath->dentry); struct file *filp; @@ -1401,8 +1401,8 @@ int ovl_ensure_verity_loaded(struct path *datapath) } int ovl_validate_verity(struct ovl_fs 
*ofs, - struct path *metapath, - struct path *datapath) + const struct path *metapath, + const struct path *datapath) { struct ovl_metacopy metacopy_data; u8 actual_digest[FS_VERITY_MAX_DIGEST_SIZE]; @@ -1455,7 +1455,7 @@ int ovl_validate_verity(struct ovl_fs *ofs, return 0; } -int ovl_get_verity_digest(struct ovl_fs *ofs, struct path *src, +int ovl_get_verity_digest(struct ovl_fs *ofs, const struct path *src, struct ovl_metacopy *metacopy) { int err, digest_size; diff --git a/fs/pidfs.c b/fs/pidfs.c index 44a95cd27377..0ef5b47d796a 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -850,7 +850,7 @@ static int pidfs_export_permission(struct handle_to_path_ctx *ctx, return 0; } -static struct file *pidfs_export_open(struct path *path, unsigned int oflags) +static struct file *pidfs_export_open(const struct path *path, unsigned int oflags) { /* * Clear O_LARGEFILE as open_by_handle_at() forces it and raise diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c index c96e5d934ba9..891ed2dc2b73 100644 --- a/fs/smb/server/vfs.c +++ b/fs/smb/server/vfs.c @@ -73,7 +73,7 @@ static int ksmbd_vfs_path_lookup(struct ksmbd_share_config *share_conf, { struct qstr last; struct filename *filename __free(putname) = NULL; - struct path *root_share_path = &share_conf->vfs_path; + const struct path *root_share_path = &share_conf->vfs_path; int err, type; struct dentry *d; @@ -1306,7 +1306,7 @@ int ksmbd_vfs_kern_path_locked(struct ksmbd_work *work, char *filepath, caseless, true); } -void ksmbd_vfs_kern_path_unlock(struct path *path) +void ksmbd_vfs_kern_path_unlock(const struct path *path) { /* While lock is still held, ->d_parent is safe */ inode_unlock(d_inode(path->dentry->d_parent)); @@ -1867,7 +1867,7 @@ void ksmbd_vfs_posix_lock_unblock(struct file_lock *flock) } int ksmbd_vfs_set_init_posix_acl(struct mnt_idmap *idmap, - struct path *path) + const struct path *path) { struct posix_acl_state acl_state; struct posix_acl *acls; @@ -1920,7 +1920,7 @@ int ksmbd_vfs_set_init_posix_acl(struct mnt_idmap *idmap, } int ksmbd_vfs_inherit_posix_acl(struct mnt_idmap *idmap, - struct path *path, struct inode *parent_inode) + const struct path *path, struct inode *parent_inode) { struct posix_acl *acls; struct posix_acl_entry *pace; diff --git a/fs/smb/server/vfs.h b/fs/smb/server/vfs.h index d47472f3e30b..df6421b4590b 100644 --- a/fs/smb/server/vfs.h +++ b/fs/smb/server/vfs.h @@ -123,7 +123,7 @@ int ksmbd_vfs_kern_path(struct ksmbd_work *work, char *name, int ksmbd_vfs_kern_path_locked(struct ksmbd_work *work, char *name, unsigned int flags, struct path *path, bool caseless); -void ksmbd_vfs_kern_path_unlock(struct path *path); +void ksmbd_vfs_kern_path_unlock(const struct path *path); struct dentry *ksmbd_vfs_kern_path_create(struct ksmbd_work *work, const char *name, unsigned int flags, @@ -164,8 +164,8 @@ int ksmbd_vfs_get_dos_attrib_xattr(struct mnt_idmap *idmap, struct dentry *dentry, struct xattr_dos_attrib *da); int ksmbd_vfs_set_init_posix_acl(struct mnt_idmap *idmap, - struct path *path); + const struct path *path); int ksmbd_vfs_inherit_posix_acl(struct mnt_idmap *idmap, - struct path *path, + const struct path *path, struct inode *parent_inode); #endif /* __KSMBD_VFS_H__ */ diff --git a/fs/stat.c b/fs/stat.c index f95c1dc3eaa4..6c79661e1b96 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -293,7 +293,7 @@ static int statx_lookup_flags(int flags) return lookup_flags; } -static int vfs_statx_path(struct path *path, int flags, struct kstat *stat, +static int vfs_statx_path(const struct path *path, int flags, struct 
kstat *stat, u32 request_mask) { int error = vfs_getattr(path, stat, request_mask, flags); diff --git a/include/dt-bindings/clock/axis,artpec8-clk.h b/include/dt-bindings/clock/axis,artpec8-clk.h new file mode 100644 index 000000000000..1e6e1409dd94 --- /dev/null +++ b/include/dt-bindings/clock/axis,artpec8-clk.h @@ -0,0 +1,169 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* + * Copyright (c) 2025 Samsung Electronics Co., Ltd. + * https://www.samsung.com + * Copyright (c) 2025 Axis Communications AB. + * https://www.axis.com + * + * Device Tree binding constants for ARTPEC-8 clock controller. + */ + +#ifndef _DT_BINDINGS_CLOCK_ARTPEC8_H +#define _DT_BINDINGS_CLOCK_ARTPEC8_H + +/* CMU_CMU */ +#define CLK_FOUT_SHARED0_PLL 1 +#define CLK_DOUT_SHARED0_DIV2 2 +#define CLK_DOUT_SHARED0_DIV3 3 +#define CLK_DOUT_SHARED0_DIV4 4 +#define CLK_FOUT_SHARED1_PLL 5 +#define CLK_DOUT_SHARED1_DIV2 6 +#define CLK_DOUT_SHARED1_DIV3 7 +#define CLK_DOUT_SHARED1_DIV4 8 +#define CLK_FOUT_AUDIO_PLL 9 +#define CLK_DOUT_CMU_BUS 10 +#define CLK_DOUT_CMU_BUS_DLP 11 +#define CLK_DOUT_CMU_CDC_CORE 12 +#define CLK_DOUT_CMU_OTP 13 +#define CLK_DOUT_CMU_CORE_MAIN 14 +#define CLK_DOUT_CMU_CORE_DLP 15 +#define CLK_DOUT_CMU_CPUCL_SWITCH 16 +#define CLK_DOUT_CMU_DLP_CORE 17 +#define CLK_DOUT_CMU_FSYS_BUS 18 +#define CLK_DOUT_CMU_FSYS_IP 19 +#define CLK_DOUT_CMU_FSYS_SCAN0 20 +#define CLK_DOUT_CMU_FSYS_SCAN1 21 +#define CLK_DOUT_CMU_GPU_3D 22 +#define CLK_DOUT_CMU_GPU_2D 23 +#define CLK_DOUT_CMU_IMEM_ACLK 24 +#define CLK_DOUT_CMU_IMEM_JPEG 25 +#define CLK_DOUT_CMU_MIF_SWITCH 26 +#define CLK_DOUT_CMU_MIF_BUSP 27 +#define CLK_DOUT_CMU_PERI_DISP 28 +#define CLK_DOUT_CMU_PERI_IP 29 +#define CLK_DOUT_CMU_PERI_AUDIO 30 +#define CLK_DOUT_CMU_RSP_CORE 31 +#define CLK_DOUT_CMU_TRFM_CORE 32 +#define CLK_DOUT_CMU_VCA_ACE 33 +#define CLK_DOUT_CMU_VCA_OD 34 +#define CLK_DOUT_CMU_VIO_CORE 35 +#define CLK_DOUT_CMU_VIO_AUDIO 36 +#define CLK_DOUT_CMU_VIP0_CORE 37 +#define CLK_DOUT_CMU_VIP1_CORE 38 +#define CLK_DOUT_CMU_VPP_CORE 39 + +/* CMU_BUS */ +#define CLK_MOUT_BUS_ACLK_USER 1 +#define CLK_MOUT_BUS_DLP_USER 2 +#define CLK_DOUT_BUS_PCLK 3 + +/* CMU_CORE */ +#define CLK_MOUT_CORE_ACLK_USER 1 +#define CLK_MOUT_CORE_DLP_USER 2 +#define CLK_DOUT_CORE_PCLK 3 + +/* CMU_CPUCL */ +#define CLK_FOUT_CPUCL_PLL 1 +#define CLK_MOUT_CPUCL_PLL 2 +#define CLK_MOUT_CPUCL_SWITCH_USER 3 +#define CLK_DOUT_CPUCL_CPU 4 +#define CLK_DOUT_CPUCL_CLUSTER_ACLK 5 +#define CLK_DOUT_CPUCL_CLUSTER_PCLKDBG 6 +#define CLK_DOUT_CPUCL_CLUSTER_CNTCLK 7 +#define CLK_DOUT_CPUCL_CLUSTER_ATCLK 8 +#define CLK_DOUT_CPUCL_PCLK 9 +#define CLK_DOUT_CPUCL_CMUREF 10 +#define CLK_DOUT_CPUCL_DBG 11 +#define CLK_DOUT_CPUCL_PCLKDBG 12 +#define CLK_GOUT_CPUCL_CLUSTER_CPU 13 +#define CLK_GOUT_CPUCL_SHORTSTOP 14 +#define CLK_GOUT_CPUCL_CSSYS_IPCLKPORT_PCLKDBG 15 +#define CLK_GOUT_CPUCL_CSSYS_IPCLKPORT_ATCLK 16 + +/* CMU_FSYS */ +#define CLK_FOUT_FSYS_PLL 1 +#define CLK_MOUT_FSYS_SCAN0_USER 2 +#define CLK_MOUT_FSYS_SCAN1_USER 3 +#define CLK_MOUT_FSYS_BUS_USER 4 +#define CLK_MOUT_FSYS_MMC_USER 5 +#define CLK_DOUT_FSYS_PCIE_PIPE 6 +#define CLK_DOUT_FSYS_ADC 7 +#define CLK_DOUT_FSYS_PCIE_PHY_REFCLK_SYSPLL 8 +#define CLK_DOUT_FSYS_EQOS_INT125 9 +#define CLK_DOUT_FSYS_OTP_MEM 10 +#define CLK_DOUT_FSYS_SCLK_UART 11 +#define CLK_DOUT_FSYS_EQOS_25 12 +#define CLK_DOUT_FSYS_EQOS_2p5 13 +#define CLK_DOUT_FSYS_BUS300 14 +#define CLK_DOUT_FSYS_BUS_QSPI 15 +#define CLK_DOUT_FSYS_MMC_CARD0 16 +#define CLK_DOUT_FSYS_MMC_CARD1 17 +#define CLK_DOUT_SCAN_CLK_FSYS_125 18 +#define 
CLK_DOUT_FSYS_QSPI 19 +#define CLK_DOUT_FSYS_SFMC_NAND 20 +#define CLK_DOUT_FSYS_SCAN_CLK_MMC 21 +#define CLK_GOUT_FSYS_USB20DRD_IPCLKPORT_ACLK_PHYCTRL_20 22 +#define CLK_GOUT_FSYS_USB20DRD_IPCLKPORT_BUS_CLK_EARLY 23 +#define CLK_GOUT_FSYS_XHB_USB_IPCLKPORT_CLK 24 +#define CLK_GOUT_FSYS_XHB_AHBBR_IPCLKPORT_CLK 25 +#define CLK_GOUT_FSYS_I2C0_IPCLKPORT_I_PCLK 26 +#define CLK_GOUT_FSYS_I2C1_IPCLKPORT_I_PCLK 27 +#define CLK_GOUT_FSYS_PWM_IPCLKPORT_I_PCLK_S0 28 +#define CLK_GOUT_FSYS_DWC_PCIE_CTL_INST_0_MSTR_ACLK_UG 29 +#define CLK_GOUT_FSYS_DWC_PCIE_CTL_INXT_0_SLV_ACLK_UG 30 +#define CLK_GOUT_FSYS_DWC_PCIE_CTL_INST_0_DBI_ACLK_UG 31 +#define CLK_GOUT_FSYS_PIPE_PAL_INST_0_I_APB_PCLK 32 +#define CLK_GOUT_FSYS_EQOS_TOP_IPCLKPORT_ACLK_I 33 +#define CLK_GOUT_FSYS_EQOS_TOP_IPCLKPORT_CLK_CSR_I 34 +#define CLK_GOUT_FSYS_EQOS_TOP_IPCLKPORT_I_RGMII_TXCLK_2P5 35 +#define CLK_GOUT_FSYS_SFMC_IPCLKPORT_I_ACLK_NAND 36 +#define CLK_GOUT_FSYS_MMC0_IPCLKPORT_SDCLKIN 37 +#define CLK_GOUT_FSYS_MMC0_IPCLKPORT_I_ACLK 38 +#define CLK_GOUT_FSYS_MMC1_IPCLKPORT_SDCLKIN 39 +#define CLK_GOUT_FSYS_MMC1_IPCLKPORT_I_ACLK 40 +#define CLK_GOUT_FSYS_PCIE_PHY_REFCLK_IN 41 +#define CLK_GOUT_FSYS_UART0_PCLK 42 +#define CLK_GOUT_FSYS_UART0_SCLK_UART 43 +#define CLK_GOUT_FSYS_BUS_QSPI 44 +#define CLK_GOUT_FSYS_QSPI_IPCLKPORT_HCLK 45 +#define CLK_GOUT_FSYS_QSPI_IPCLKPORT_SSI_CLK 46 + +/* CMU_IMEM */ +#define CLK_MOUT_IMEM_ACLK_USER 1 +#define CLK_MOUT_IMEM_GIC_CA53 2 +#define CLK_MOUT_IMEM_GIC_CA5 3 +#define CLK_MOUT_IMEM_JPEG_USER 4 +#define CLK_GOUT_IMEM_MCT_PCLK 5 +#define CLK_GOUT_IMEM_PCLK_TMU0_APBIF 6 + +/* CMU_PERI */ +#define CLK_MOUT_PERI_IP_USER 1 +#define CLK_MOUT_PERI_AUDIO_USER 2 +#define CLK_MOUT_PERI_I2S0 3 +#define CLK_MOUT_PERI_I2S1 4 +#define CLK_MOUT_PERI_DISP_USER 5 +#define CLK_DOUT_PERI_SPI 6 +#define CLK_DOUT_PERI_UART1 7 +#define CLK_DOUT_PERI_UART2 8 +#define CLK_DOUT_PERI_PCLK 9 +#define CLK_DOUT_PERI_I2S0 10 +#define CLK_DOUT_PERI_I2S1 11 +#define CLK_DOUT_PERI_DSIM 12 +#define CLK_GOUT_PERI_UART1_PCLK 13 +#define CLK_GOUT_PERI_UART1_SCLK_UART 14 +#define CLK_GOUT_PERI_UART2_PCLK 15 +#define CLK_GOUT_PERI_UART2_SCLK_UART 16 +#define CLK_GOUT_PERI_I2C2_IPCLKPORT_I_PCLK 17 +#define CLK_GOUT_PERI_I2C3_IPCLKPORT_I_PCLK 18 +#define CLK_GOUT_PERI_SPI0_PCLK 19 +#define CLK_GOUT_PERI_SPI0_SCLK_SPI 20 +#define CLK_GOUT_PERI_APB_ASYNC_DSIM_IPCLKPORT_PCLKS 21 +#define CLK_GOUT_PERI_I2SSC0_IPCLKPORT_CLK_HST 22 +#define CLK_GOUT_PERI_I2SSC1_IPCLKPORT_CLK_HST 23 +#define CLK_GOUT_PERI_AUDIO_OUT_IPCLKPORT_CLK 24 +#define CLK_GOUT_PERI_I2SSC0_IPCLKPORT_CLK 25 +#define CLK_GOUT_PERI_I2SSC1_IPCLKPORT_CLK 26 +#define CLK_GOUT_PERI_DMA4DSIM_IPCLKPORT_CLK_APB_CLK 27 +#define CLK_GOUT_PERI_DMA4DSIM_IPCLKPORT_CLK_AXI_CLK 28 + +#endif /* _DT_BINDINGS_CLOCK_ARTPEC8_H */ diff --git a/include/linux/acpi_rimt.h b/include/linux/acpi_rimt.h new file mode 100644 index 000000000000..fad3adc4d899 --- /dev/null +++ b/include/linux/acpi_rimt.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2024-2025, Ventana Micro Systems Inc. 
+ * Author: Sunil V L <sunilvl@ventanamicro.com> + */ + +#ifndef _ACPI_RIMT_H +#define _ACPI_RIMT_H + +#ifdef CONFIG_ACPI_RIMT +int rimt_iommu_register(struct device *dev); +#else +static inline int rimt_iommu_register(struct device *dev) +{ + return -ENODEV; +} +#endif + +#if defined(CONFIG_IOMMU_API) && defined(CONFIG_ACPI_RIMT) +int rimt_iommu_configure_id(struct device *dev, const u32 *id_in); +#else +static inline int rimt_iommu_configure_id(struct device *dev, const u32 *id_in) +{ + return -ENODEV; +} +#endif + +#endif /* _ACPI_RIMT_H */ diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h index f3bc0bcd7098..c249912456f9 100644 --- a/include/linux/dma-direct.h +++ b/include/linux/dma-direct.h @@ -149,7 +149,5 @@ void dma_direct_free_pages(struct device *dev, size_t size, struct page *page, dma_addr_t dma_addr, enum dma_data_direction dir); int dma_direct_supported(struct device *dev, u64 mask); -dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir, unsigned long attrs); #endif /* _LINUX_DMA_DIRECT_H */ diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index 332b80c42b6f..10882d00cb17 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -395,15 +395,15 @@ void *arch_dma_set_uncached(void *addr, size_t size); void arch_dma_clear_uncached(void *addr, size_t size); #ifdef CONFIG_ARCH_HAS_DMA_MAP_DIRECT -bool arch_dma_map_page_direct(struct device *dev, phys_addr_t addr); -bool arch_dma_unmap_page_direct(struct device *dev, dma_addr_t dma_handle); +bool arch_dma_map_phys_direct(struct device *dev, phys_addr_t addr); +bool arch_dma_unmap_phys_direct(struct device *dev, dma_addr_t dma_handle); bool arch_dma_map_sg_direct(struct device *dev, struct scatterlist *sg, int nents); bool arch_dma_unmap_sg_direct(struct device *dev, struct scatterlist *sg, int nents); #else -#define arch_dma_map_page_direct(d, a) (false) -#define arch_dma_unmap_page_direct(d, a) (false) +#define arch_dma_map_phys_direct(d, a) (false) +#define arch_dma_unmap_phys_direct(d, a) (false) #define arch_dma_map_sg_direct(d, s, n) (false) #define arch_dma_unmap_sg_direct(d, s, n) (false) #endif diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 55c03e5fe8cb..8248ff9363ee 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -59,6 +59,26 @@ #define DMA_ATTR_PRIVILEGED (1UL << 9) /* + * DMA_ATTR_MMIO - Indicates memory-mapped I/O (MMIO) region for DMA mapping + * + * This attribute indicates the physical address is not normal system + * memory. It may not be used with kmap*()/phys_to_virt()/phys_to_page() + * functions, it may not be cacheable, and access using CPU load/store + * instructions may not be allowed. + * + * Usually this will be used to describe MMIO addresses, or other non-cacheable + * register addresses. When DMA mapping this sort of address we call + * the operation Peer to Peer as one device is DMA'ing to another device. + * For PCI devices the p2pdma APIs must be used to determine if DMA_ATTR_MMIO + * is appropriate. + * + * For architectures that require cache flushing for DMA coherence + * DMA_ATTR_MMIO will not perform any cache flushing. The address + * provided must never be mapped cacheable into the CPU. + */ +#define DMA_ATTR_MMIO (1UL << 10) + +/* + * A dma_addr_t can hold any valid DMA or bus address for the platform. It can + * be given to a device to use as a DMA source or target. It is specific to a
It is specific to a * given device and there may be a translation between the CPU physical address @@ -118,6 +138,10 @@ dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, unsigned long attrs); void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs); +dma_addr_t dma_map_phys(struct device *dev, phys_addr_t phys, size_t size, + enum dma_data_direction dir, unsigned long attrs); +void dma_unmap_phys(struct device *dev, dma_addr_t addr, size_t size, + enum dma_data_direction dir, unsigned long attrs); unsigned int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, unsigned long attrs); void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg, @@ -172,6 +196,15 @@ static inline void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { } +static inline dma_addr_t dma_map_phys(struct device *dev, phys_addr_t phys, + size_t size, enum dma_data_direction dir, unsigned long attrs) +{ + return DMA_MAPPING_ERROR; +} +static inline void dma_unmap_phys(struct device *dev, dma_addr_t addr, + size_t size, enum dma_data_direction dir, unsigned long attrs) +{ +} static inline unsigned int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, unsigned long attrs) diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index 3aac58a520c7..d0cf10d5e0f7 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -276,7 +276,7 @@ struct export_operations { int (*commit_blocks)(struct inode *inode, struct iomap *iomaps, int nr_iomaps, struct iattr *iattr); int (*permission)(struct handle_to_path_ctx *ctx, unsigned int oflags); - struct file * (*open)(struct path *path, unsigned int oflags); + struct file * (*open)(const struct path *path, unsigned int oflags); #define EXPORT_OP_NOWCC (0x1) /* don't collect v3 wcc data */ #define EXPORT_OP_NOSUBTREECHK (0x2) /* no subtree checking */ #define EXPORT_OP_CLOSE_BEFORE_UNLINK (0x4) /* close files before unlink */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 1f4a1c570a2a..540004970ad5 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1192,6 +1192,8 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index) * @f_cred: stashed credentials of creator/opener * @f_owner: file owner * @f_path: path of the file + * @__f_path: writable alias for @f_path; *ONLY* for core VFS and only before + * the file gets open * @f_pos_lock: lock protecting file position * @f_pipe: specific to pipes * @f_pos: file position @@ -1217,7 +1219,10 @@ struct file { const struct cred *f_cred; struct fown_struct *f_owner; /* --- cacheline 1 boundary (64 bytes) --- */ - struct path f_path; + union { + const struct path f_path; + struct path __f_path; + }; union { /* regular files (with FMODE_ATOMIC_POS) and directories */ struct mutex f_pos_lock; @@ -2877,7 +2882,7 @@ struct file *dentry_open_nonotify(const struct path *path, int flags, const struct cred *cred); struct file *dentry_create(const struct path *path, int flags, umode_t mode, const struct cred *cred); -struct path *backing_file_user_path(const struct file *f); +const struct path *backing_file_user_path(const struct file *f); /* * When mmapping a file on a stackable filesystem (e.g., overlayfs), the file diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h index 138fbd89b1e6..8a823c6f2b4a 100644 --- 
a/include/linux/io-pgtable.h +++ b/include/linux/io-pgtable.h @@ -180,6 +180,7 @@ struct io_pgtable_cfg { struct { u64 ttbr[4]; u32 n_ttbrs; + u32 n_levels; } apple_dart_cfg; struct { diff --git a/include/linux/iommu-dma.h b/include/linux/iommu-dma.h index 508beaa44c39..a92b3ff9b934 100644 --- a/include/linux/iommu-dma.h +++ b/include/linux/iommu-dma.h @@ -21,10 +21,9 @@ static inline bool use_dma_iommu(struct device *dev) } #endif /* CONFIG_IOMMU_DMA */ -dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, enum dma_data_direction dir, - unsigned long attrs); -void iommu_dma_unmap_page(struct device *dev, dma_addr_t dma_handle, +dma_addr_t iommu_dma_map_phys(struct device *dev, phys_addr_t phys, size_t size, + enum dma_data_direction dir, unsigned long attrs); +void iommu_dma_unmap_phys(struct device *dev, dma_addr_t dma_handle, size_t size, enum dma_data_direction dir, unsigned long attrs); int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, unsigned long attrs); @@ -43,10 +42,6 @@ size_t iommu_dma_opt_mapping_size(void); size_t iommu_dma_max_mapping_size(struct device *dev); void iommu_dma_free(struct device *dev, size_t size, void *cpu_addr, dma_addr_t handle, unsigned long attrs); -dma_addr_t iommu_dma_map_resource(struct device *dev, phys_addr_t phys, - size_t size, enum dma_data_direction dir, unsigned long attrs); -void iommu_dma_unmap_resource(struct device *dev, dma_addr_t handle, - size_t size, enum dma_data_direction dir, unsigned long attrs); struct sg_table *iommu_dma_alloc_noncontiguous(struct device *dev, size_t size, enum dma_data_direction dir, gfp_t gfp, unsigned long attrs); void iommu_dma_free_noncontiguous(struct device *dev, size_t size, diff --git a/include/linux/kmsan.h b/include/linux/kmsan.h index 2b1432cc16d5..f2fd221107bb 100644 --- a/include/linux/kmsan.h +++ b/include/linux/kmsan.h @@ -182,8 +182,7 @@ void kmsan_iounmap_page_range(unsigned long start, unsigned long end); /** * kmsan_handle_dma() - Handle a DMA data transfer. - * @page: first page of the buffer. - * @offset: offset of the buffer within the first page. + * @phys: physical address of the buffer. * @size: buffer size. * @dir: one of possible dma_data_direction values. * @@ -192,7 +191,7 @@ void kmsan_iounmap_page_range(unsigned long start, unsigned long end); * * initializes the buffer, if it is copied from device; * * does both, if this is a DMA_BIDIRECTIONAL transfer. 
*/ -void kmsan_handle_dma(struct page *page, size_t offset, size_t size, +void kmsan_handle_dma(phys_addr_t phys, size_t size, enum dma_data_direction dir); /** @@ -372,8 +371,8 @@ static inline void kmsan_iounmap_page_range(unsigned long start, { } -static inline void kmsan_handle_dma(struct page *page, size_t offset, - size_t size, enum dma_data_direction dir) +static inline void kmsan_handle_dma(phys_addr_t phys, size_t size, + enum dma_data_direction dir) { } diff --git a/include/linux/namei.h b/include/linux/namei.h index a7800ef04e76..fed86221c69c 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -61,10 +61,10 @@ struct dentry *kern_path_parent(const char *name, struct path *parent); extern struct dentry *start_creating_path(int, const char *, struct path *, unsigned int); extern struct dentry *start_creating_user_path(int, const char __user *, struct path *, unsigned int); -extern void end_creating_path(struct path *, struct dentry *); +extern void end_creating_path(const struct path *, struct dentry *); extern struct dentry *start_removing_path(const char *, struct path *); extern struct dentry *start_removing_user_path_at(int , const char __user *, struct path *); -static inline void end_removing_path(struct path *path , struct dentry *dentry) +static inline void end_removing_path(const struct path *path , struct dentry *dentry) { end_creating_path(path, dentry); } diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index 9aed39abc94b..afe1d8f09d89 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -122,8 +122,6 @@ struct nfs_pageio_descriptor { /* arbitrarily selected limit to number of mirrors */ #define NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX 16 -#define NFS_WBACK_BUSY(req) (test_bit(PG_BUSY,&(req)->wb_flags)) - extern struct nfs_page *nfs_page_create_from_page(struct nfs_open_context *ctx, struct page *page, unsigned int pgbase, diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index ac4bff6e9913..d56583572c98 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -862,7 +862,7 @@ struct nfs_getaclres { size_t acl_len; size_t acl_data_offset; int acl_flags; - struct page * acl_scratch; + struct folio * acl_scratch; }; struct nfs_setattrres { @@ -1596,7 +1596,7 @@ struct nfs42_listxattrsargs { struct nfs42_listxattrsres { struct nfs4_sequence_res seq_res; - struct page *scratch; + struct folio *scratch; void *xattr_buf; size_t xattr_len; u64 cookie; diff --git a/include/linux/nfslocalio.h b/include/linux/nfslocalio.h index 5c7c92659e73..7ca2715edccc 100644 --- a/include/linux/nfslocalio.h +++ b/include/linux/nfslocalio.h @@ -65,6 +65,8 @@ struct nfsd_localio_operations { struct net *(*nfsd_file_put_local)(struct nfsd_file __rcu **); struct nfsd_file *(*nfsd_file_get_local)(struct nfsd_file *); struct file *(*nfsd_file_file)(struct nfsd_file *); + void (*nfsd_file_dio_alignment)(struct nfsd_file *, + u32 *, u32 *, u32 *); } ____cacheline_aligned; extern void nfsd_localio_ops_init(void); diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 48e27768e7ba..0091ad1986bf 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -618,6 +618,7 @@ FOLIO_FLAG(dropbehind, FOLIO_HEAD_PAGE) #else PAGEFLAG_FALSE(HighMem, highmem) #endif +#define PhysHighMem(__p) (PageHighMem(phys_to_page(__p))) /* Does kmap_local_folio() only allow access to one page of the folio? 
*/ #ifdef CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 185644e288ea..09b581c1d878 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1229,6 +1229,8 @@ void folio_wait_writeback(struct folio *folio); int folio_wait_writeback_killable(struct folio *folio); void end_page_writeback(struct page *page); void folio_end_writeback(struct folio *folio); +void folio_end_writeback_no_dropbehind(struct folio *folio); +void folio_end_dropbehind(struct folio *folio); void folio_wait_stable(struct folio *folio); void __folio_mark_dirty(struct folio *folio, struct address_space *, int warn); void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb); diff --git a/include/linux/sunrpc/debug.h b/include/linux/sunrpc/debug.h index f6aeed07fe04..891f6173c951 100644 --- a/include/linux/sunrpc/debug.h +++ b/include/linux/sunrpc/debug.h @@ -23,43 +23,30 @@ extern unsigned int nlm_debug; #define dprintk(fmt, ...) \ dfprintk(FACILITY, fmt, ##__VA_ARGS__) -#define dprintk_cont(fmt, ...) \ - dfprintk_cont(FACILITY, fmt, ##__VA_ARGS__) #define dprintk_rcu(fmt, ...) \ dfprintk_rcu(FACILITY, fmt, ##__VA_ARGS__) -#define dprintk_rcu_cont(fmt, ...) \ - dfprintk_rcu_cont(FACILITY, fmt, ##__VA_ARGS__) #undef ifdebug #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) # define ifdebug(fac) if (unlikely(rpc_debug & RPCDBG_##fac)) -# define dfprintk(fac, fmt, ...) \ -do { \ - ifdebug(fac) \ - printk(KERN_DEFAULT fmt, ##__VA_ARGS__); \ -} while (0) +# if IS_ENABLED(CONFIG_SUNRPC_DEBUG_TRACE) +# define __sunrpc_printk(fmt, ...) trace_printk(fmt, ##__VA_ARGS__) +# else +# define __sunrpc_printk(fmt, ...) printk(KERN_DEFAULT fmt, ##__VA_ARGS__) +# endif -# define dfprintk_cont(fac, fmt, ...) \ +# define dfprintk(fac, fmt, ...) \ do { \ ifdebug(fac) \ - printk(KERN_CONT fmt, ##__VA_ARGS__); \ + __sunrpc_printk(fmt, ##__VA_ARGS__); \ } while (0) # define dfprintk_rcu(fac, fmt, ...) \ do { \ ifdebug(fac) { \ rcu_read_lock(); \ - printk(KERN_DEFAULT fmt, ##__VA_ARGS__); \ - rcu_read_unlock(); \ - } \ -} while (0) - -# define dfprintk_rcu_cont(fac, fmt, ...) \ -do { \ - ifdebug(fac) { \ - rcu_read_lock(); \ - printk(KERN_CONT fmt, ##__VA_ARGS__); \ + __sunrpc_printk(fmt, ##__VA_ARGS__); \ rcu_read_unlock(); \ } \ } while (0) @@ -68,7 +55,6 @@ do { \ #else # define ifdebug(fac) if (0) # define dfprintk(fac, fmt, ...) do {} while (0) -# define dfprintk_cont(fac, fmt, ...) do {} while (0) # define dfprintk_rcu(fac, fmt, ...) 
do {} while (0) # define RPC_IFDEBUG(x) #endif diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 40cbe81360ed..5506d20857c3 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -196,7 +196,7 @@ struct svc_rqst { struct xdr_buf rq_arg; struct xdr_stream rq_arg_stream; struct xdr_stream rq_res_stream; - struct page *rq_scratch_page; + struct folio *rq_scratch_folio; struct xdr_buf rq_res; unsigned long rq_maxpages; /* num of entries in rq_pages */ struct page * *rq_pages; @@ -503,7 +503,7 @@ static inline void svcxdr_init_decode(struct svc_rqst *rqstp) buf->len = buf->head->iov_len + buf->page_len + buf->tail->iov_len; xdr_init_decode(xdr, buf, argv->iov_base, NULL); - xdr_set_scratch_page(xdr, rqstp->rq_scratch_page); + xdr_set_scratch_folio(xdr, rqstp->rq_scratch_folio); } /** diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index 369a89aea186..fde60d4e2cd5 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -165,7 +165,8 @@ int svc_xprt_create(struct svc_serv *serv, const char *xprt_name, struct net *net, const int family, const unsigned short port, int flags, const struct cred *cred); -void svc_xprt_destroy_all(struct svc_serv *serv, struct net *net); +void svc_xprt_destroy_all(struct svc_serv *serv, struct net *net, + bool unregister); void svc_xprt_received(struct svc_xprt *xprt); void svc_xprt_enqueue(struct svc_xprt *xprt); void svc_xprt_put(struct svc_xprt *xprt); diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index 8a9ec617cf66..49278749ad0c 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -288,16 +288,16 @@ xdr_set_scratch_buffer(struct xdr_stream *xdr, void *buf, size_t buflen) } /** - * xdr_set_scratch_page - Attach a scratch buffer for decoding data + * xdr_set_scratch_folio - Attach a scratch buffer for decoding data * @xdr: pointer to xdr_stream struct - * @page: an anonymous page + * @folio: an anonymous folio * * See xdr_set_scratch_buffer().
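The folio conversion just below keeps the calling convention of the old page-based scratch helper. A minimal sketch of its use, mirroring the gss_rpc_xdr.c hunk later in this diff (the decode body is elided and the function name is made up):

/* Hypothetical decode helper using the scratch-folio API. */
static int example_decode(struct xdr_stream *xdr)
{
	struct folio *scratch;

	scratch = folio_alloc(GFP_KERNEL, 0);	/* order 0: one page of scratch */
	if (!scratch)
		return -ENOMEM;
	xdr_set_scratch_folio(xdr, scratch);	/* covers folio_size() bytes */

	/* ... xdr_inline_decode() may now reassemble split items ... */

	folio_put(scratch);
	return 0;
}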
*/ static inline void -xdr_set_scratch_page(struct xdr_stream *xdr, struct page *page) +xdr_set_scratch_folio(struct xdr_stream *xdr, struct folio *folio) { - xdr_set_scratch_buffer(xdr, page_address(page), PAGE_SIZE); + xdr_set_scratch_buffer(xdr, folio_address(folio), folio_size(folio)); } /** diff --git a/include/rdma/ib_mad.h b/include/rdma/ib_mad.h index 3f1b58d8b4bf..8bd0e1eb393b 100644 --- a/include/rdma/ib_mad.h +++ b/include/rdma/ib_mad.h @@ -48,6 +48,7 @@ #define IB_MGMT_METHOD_REPORT 0x06 #define IB_MGMT_METHOD_REPORT_RESP 0x86 #define IB_MGMT_METHOD_TRAP_REPRESS 0x07 +#define IB_MGMT_METHOD_GET_TABLE 0x12 #define IB_MGMT_METHOD_RESP 0x80 #define IB_BM_ATTR_MOD_RESP cpu_to_be32(1) diff --git a/include/rdma/ib_sa.h b/include/rdma/ib_sa.h index b46353fc53bf..95e8924ad563 100644 --- a/include/rdma/ib_sa.h +++ b/include/rdma/ib_sa.h @@ -189,6 +189,20 @@ struct sa_path_rec { u32 flags; }; +struct sa_service_rec { + __be64 id; + __u8 gid[16]; + __be16 pkey; + __u8 reserved[2]; + __be32 lease; + __u8 key[16]; + __u8 name[64]; + __u8 data_8[16]; + __be16 data_16[8]; + __be32 data_32[4]; + __be64 data_64[2]; +}; + static inline enum ib_gid_type sa_conv_pathrec_to_gid_type(struct sa_path_rec *rec) { @@ -417,6 +431,17 @@ int ib_sa_path_rec_get(struct ib_sa_client *client, struct ib_device *device, unsigned int num_prs, void *context), void *context, struct ib_sa_query **query); +int ib_sa_service_rec_get(struct ib_sa_client *client, + struct ib_device *device, u32 port_num, + struct sa_service_rec *rec, + ib_sa_comp_mask comp_mask, + unsigned long timeout_ms, gfp_t gfp_mask, + void (*callback)(int status, + struct sa_service_rec *resp, + unsigned int num_services, + void *context), + void *context, struct ib_sa_query **sa_query); + struct ib_sa_multicast { struct ib_sa_mcmember_rec rec; ib_sa_comp_mask comp_mask; @@ -509,6 +534,18 @@ int ib_init_ah_attr_from_path(struct ib_device *device, u32 port_num, void ib_sa_pack_path(struct sa_path_rec *rec, void *attribute); /** + * ib_sa_pack_service - Convert a service record from struct sa_service_rec + * to IB MAD wire format. + */ +void ib_sa_pack_service(struct sa_service_rec *rec, void *attribute); + +/** + * ib_sa_unpack_service - Convert a service record from MAD format to struct + * sa_service_rec. + */ +void ib_sa_unpack_service(void *attribute, struct sa_service_rec *rec); + +/** * ib_sa_unpack_path - Convert a path record from MAD format to struct * ib_sa_path_rec.
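These service-record helpers are consumed by the RDMA CM changes further down. Going by the rdma_resolve_ib_service() kernel-doc in the rdma_cm.h hunk below, service resolution slots in between address and route resolution on the client side, roughly as in this sketch (the service name and timeouts are illustrative, and the asynchronous event handling is reduced to comments):

/* Hypothetical client-side resolution sequence. */
static int example_resolve(struct rdma_cm_id *id, struct sockaddr *dst_addr)
{
	struct rdma_ucm_ib_service ibs = {
		.flags = RDMA_USER_CM_IB_SERVICE_FLAG_NAME,
	};
	int ret;

	strscpy((char *)ibs.service_name, "example-svc",
		sizeof(ibs.service_name));

	ret = rdma_resolve_addr(id, NULL, dst_addr, 2000);
	if (ret)
		return ret;
	/* wait for RDMA_CM_EVENT_ADDR_RESOLVED */

	ret = rdma_resolve_ib_service(id, &ibs);	/* at most once per id */
	if (ret)
		return ret;
	/* wait for RDMA_CM_EVENT_ADDRINFO_RESOLVED */

	return rdma_resolve_route(id, 2000);
	/* then RDMA_CM_EVENT_ROUTE_RESOLVED and rdma_connect() as usual */
}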
*/ diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h index d1593ad47e28..9bd930a83e6e 100644 --- a/include/rdma/rdma_cm.h +++ b/include/rdma/rdma_cm.h @@ -33,7 +33,11 @@ enum rdma_cm_event_type { RDMA_CM_EVENT_MULTICAST_JOIN, RDMA_CM_EVENT_MULTICAST_ERROR, RDMA_CM_EVENT_ADDR_CHANGE, - RDMA_CM_EVENT_TIMEWAIT_EXIT + RDMA_CM_EVENT_TIMEWAIT_EXIT, + RDMA_CM_EVENT_ADDRINFO_RESOLVED, + RDMA_CM_EVENT_ADDRINFO_ERROR, + RDMA_CM_EVENT_USER, + RDMA_CM_EVENT_INTERNAL, }; const char *__attribute_const__ rdma_event_msg(enum rdma_cm_event_type event); @@ -63,6 +67,9 @@ struct rdma_route { * 2 - Both primary and alternate path are available */ int num_pri_alt_paths; + + unsigned int num_service_recs; + struct sa_service_rec *service_recs; }; struct rdma_conn_param { @@ -93,6 +100,7 @@ struct rdma_cm_event { union { struct rdma_conn_param conn; struct rdma_ud_param ud; + u64 arg; } param; struct rdma_ucm_ece ece; }; @@ -198,6 +206,17 @@ int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, int rdma_resolve_route(struct rdma_cm_id *id, unsigned long timeout_ms); /** + * rdma_resolve_ib_service - Resolve the IB service record of the + * service with the given service ID or name. + * + * This function is optional in the rdma cm flow. It is called on the client + * side of a connection, before calling rdma_resolve_route. The resolution + * can be done once per rdma_cm_id. + */ +int rdma_resolve_ib_service(struct rdma_cm_id *id, + struct rdma_ucm_ib_service *ibs); + +/** * rdma_create_qp - Allocate a QP and associate it with the specified RDMA * identifier. * diff --git a/include/scsi/libsas.h b/include/scsi/libsas.h index 9c6e90829dbd..a0635b128d7a 100644 --- a/include/scsi/libsas.h +++ b/include/scsi/libsas.h @@ -203,6 +203,14 @@ static inline bool dev_is_expander(enum sas_device_type type) type == SAS_FANOUT_EXPANDER_DEVICE; } +static inline bool dev_parent_is_expander(struct domain_device *dev) +{ + if (!dev->parent) + return false; + + return dev_is_expander(dev->parent->dev_type); +} + static inline void INIT_SAS_WORK(struct sas_work *sw, void (*fn)(struct work_struct *)) { INIT_WORK(&sw->work, fn); diff --git a/include/trace/events/dma.h b/include/trace/events/dma.h index d8ddc27b6a7c..5da59fd8121d 100644 --- a/include/trace/events/dma.h +++ b/include/trace/events/dma.h @@ -31,7 +31,8 @@ TRACE_DEFINE_ENUM(DMA_NONE); { DMA_ATTR_FORCE_CONTIGUOUS, "FORCE_CONTIGUOUS" }, \ { DMA_ATTR_ALLOC_SINGLE_PAGES, "ALLOC_SINGLE_PAGES" }, \ { DMA_ATTR_NO_WARN, "NO_WARN" }, \ - { DMA_ATTR_PRIVILEGED, "PRIVILEGED" }) + { DMA_ATTR_PRIVILEGED, "PRIVILEGED" }, \ + { DMA_ATTR_MMIO, "MMIO" }) DECLARE_EVENT_CLASS(dma_map, TP_PROTO(struct device *dev, phys_addr_t phys_addr, dma_addr_t dma_addr, @@ -71,8 +72,7 @@ DEFINE_EVENT(dma_map, name, \ size_t size, enum dma_data_direction dir, unsigned long attrs), \ TP_ARGS(dev, phys_addr, dma_addr, size, dir, attrs)) -DEFINE_MAP_EVENT(dma_map_page); -DEFINE_MAP_EVENT(dma_map_resource); +DEFINE_MAP_EVENT(dma_map_phys); DECLARE_EVENT_CLASS(dma_unmap, TP_PROTO(struct device *dev, dma_addr_t addr, size_t size, @@ -109,8 +109,7 @@ DEFINE_EVENT(dma_unmap, name, \ enum dma_data_direction dir, unsigned long attrs), \ TP_ARGS(dev, addr, size, dir, attrs)) -DEFINE_UNMAP_EVENT(dma_unmap_page); -DEFINE_UNMAP_EVENT(dma_unmap_resource); +DEFINE_UNMAP_EVENT(dma_unmap_phys); DECLARE_EVENT_CLASS(dma_alloc_class, TP_PROTO(struct device *dev, void *virt_addr, dma_addr_t dma_addr, diff --git a/include/trace/misc/fs.h b/include/trace/misc/fs.h index 0406ebe2a80a..7ead1c61f0cb 
100644 --- a/include/trace/misc/fs.h +++ b/include/trace/misc/fs.h @@ -141,3 +141,25 @@ { ATTR_TIMES_SET, "TIMES_SET" }, \ { ATTR_TOUCH, "TOUCH"}, \ { ATTR_DELEG, "DELEG"}) + +#define show_statx_mask(flags) \ + __print_flags(flags, "|", \ + { STATX_TYPE, "TYPE" }, \ + { STATX_MODE, "MODE" }, \ + { STATX_NLINK, "NLINK" }, \ + { STATX_UID, "UID" }, \ + { STATX_GID, "GID" }, \ + { STATX_ATIME, "ATIME" }, \ + { STATX_MTIME, "MTIME" }, \ + { STATX_CTIME, "CTIME" }, \ + { STATX_INO, "INO" }, \ + { STATX_SIZE, "SIZE" }, \ + { STATX_BLOCKS, "BLOCKS" }, \ + { STATX_BASIC_STATS, "BASIC_STATS" }, \ + { STATX_BTIME, "BTIME" }, \ + { STATX_MNT_ID, "MNT_ID" }, \ + { STATX_DIOALIGN, "DIOALIGN" }, \ + { STATX_MNT_ID_UNIQUE, "MNT_ID_UNIQUE" }, \ + { STATX_SUBVOL, "SUBVOL" }, \ + { STATX_WRITE_ATOMIC, "WRITE_ATOMIC" }, \ + { STATX_DIO_READ_ALIGN, "DIO_READ_ALIGN" }) diff --git a/include/uapi/rdma/ib_user_ioctl_verbs.h b/include/uapi/rdma/ib_user_ioctl_verbs.h index fe15bc7e9f70..89e6a3f13191 100644 --- a/include/uapi/rdma/ib_user_ioctl_verbs.h +++ b/include/uapi/rdma/ib_user_ioctl_verbs.h @@ -255,6 +255,7 @@ enum rdma_driver_id { RDMA_DRIVER_SIW, RDMA_DRIVER_ERDMA, RDMA_DRIVER_MANA, + RDMA_DRIVER_IONIC, }; enum ib_uverbs_gid_type { diff --git a/include/uapi/rdma/ib_user_sa.h b/include/uapi/rdma/ib_user_sa.h index 435155d6e1c6..acfa20816bc6 100644 --- a/include/uapi/rdma/ib_user_sa.h +++ b/include/uapi/rdma/ib_user_sa.h @@ -74,4 +74,18 @@ struct ib_user_path_rec { __u8 preference; }; +struct ib_user_service_rec { + __be64 id; + __u8 gid[16]; + __be16 pkey; + __u8 reserved[2]; + __be32 lease; + __u8 key[16]; + __u8 name[64]; + __u8 data_8[16]; + __be16 data_16[8]; + __be32 data_32[4]; + __be64 data_64[2]; +}; + #endif /* IB_USER_SA_H */ diff --git a/include/uapi/rdma/ionic-abi.h b/include/uapi/rdma/ionic-abi.h new file mode 100644 index 000000000000..7b589d3e9728 --- /dev/null +++ b/include/uapi/rdma/ionic-abi.h @@ -0,0 +1,115 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* Copyright (C) 2018-2025, Advanced Micro Devices, Inc */ + +#ifndef IONIC_ABI_H +#define IONIC_ABI_H + +#include <linux/types.h> + +#define IONIC_ABI_VERSION 1 + +#define IONIC_EXPDB_64 1 +#define IONIC_EXPDB_128 2 +#define IONIC_EXPDB_256 4 +#define IONIC_EXPDB_512 8 + +#define IONIC_EXPDB_SQ 1 +#define IONIC_EXPDB_RQ 2 + +#define IONIC_CMB_ENABLE 1 +#define IONIC_CMB_REQUIRE 2 +#define IONIC_CMB_EXPDB 4 +#define IONIC_CMB_WC 8 +#define IONIC_CMB_UC 16 + +struct ionic_ctx_req { + __u32 rsvd[2]; +}; + +struct ionic_ctx_resp { + __u32 rsvd; + __u32 page_shift; + + __aligned_u64 dbell_offset; + + __u16 version; + __u8 qp_opcodes; + __u8 admin_opcodes; + + __u8 sq_qtype; + __u8 rq_qtype; + __u8 cq_qtype; + __u8 admin_qtype; + + __u8 max_stride; + __u8 max_spec; + __u8 udma_count; + __u8 expdb_mask; + __u8 expdb_qtypes; + + __u8 rsvd2[3]; +}; + +struct ionic_qdesc { + __aligned_u64 addr; + __u32 size; + __u16 mask; + __u8 depth_log2; + __u8 stride_log2; +}; + +struct ionic_ah_resp { + __u32 ahid; + __u32 pad; +}; + +struct ionic_cq_req { + struct ionic_qdesc cq[2]; + __u8 udma_mask; + __u8 rsvd[7]; +}; + +struct ionic_cq_resp { + __u32 cqid[2]; + __u8 udma_mask; + __u8 rsvd[7]; +}; + +struct ionic_qp_req { + struct ionic_qdesc sq; + struct ionic_qdesc rq; + __u8 sq_spec; + __u8 rq_spec; + __u8 sq_cmb; + __u8 rq_cmb; + __u8 udma_mask; + __u8 rsvd[3]; +}; + +struct ionic_qp_resp { + __u32 qpid; + __u8 sq_cmb; + __u8 rq_cmb; + __u8 udma_idx; + __u8 rsvd[1]; + __aligned_u64 sq_cmb_offset; + __aligned_u64 rq_cmb_offset; +}; + 
+struct ionic_srq_req { + struct ionic_qdesc rq; + __u8 rq_spec; + __u8 rq_cmb; + __u8 udma_mask; + __u8 rsvd[5]; +}; + +struct ionic_srq_resp { + __u32 qpid; + __u8 rq_cmb; + __u8 udma_idx; + __u8 rsvd[2]; + __aligned_u64 rq_cmb_offset; +}; + +#endif /* IONIC_ABI_H */ diff --git a/include/uapi/rdma/irdma-abi.h b/include/uapi/rdma/irdma-abi.h index bb18f15489e3..f7788d33376b 100644 --- a/include/uapi/rdma/irdma-abi.h +++ b/include/uapi/rdma/irdma-abi.h @@ -20,11 +20,14 @@ enum irdma_memreg_type { IRDMA_MEMREG_TYPE_MEM = 0, IRDMA_MEMREG_TYPE_QP = 1, IRDMA_MEMREG_TYPE_CQ = 2, + IRDMA_MEMREG_TYPE_SRQ = 3, }; enum { IRDMA_ALLOC_UCTX_USE_RAW_ATTR = 1 << 0, IRDMA_ALLOC_UCTX_MIN_HW_WQ_SIZE = 1 << 1, + IRDMA_ALLOC_UCTX_MAX_HW_SRQ_QUANTA = 1 << 2, + IRDMA_SUPPORT_WQE_FORMAT_V2 = 1 << 3, }; struct irdma_alloc_ucontext_req { @@ -54,7 +57,8 @@ struct irdma_alloc_ucontext_resp { __u8 rsvd2; __aligned_u64 comp_mask; __u16 min_hw_wq_size; - __u8 rsvd3[6]; + __u32 max_hw_srq_quanta; + __u8 rsvd3[2]; }; struct irdma_alloc_pd_resp { @@ -71,6 +75,16 @@ struct irdma_create_cq_req { __aligned_u64 user_shadow_area; }; +struct irdma_create_srq_req { + __aligned_u64 user_srq_buf; + __aligned_u64 user_shadow_area; +}; + +struct irdma_create_srq_resp { + __u32 srq_id; + __u32 srq_size; +}; + struct irdma_create_qp_req { __aligned_u64 user_wqe_bufs; __aligned_u64 user_compl_ctx; diff --git a/include/uapi/rdma/rdma_user_cm.h b/include/uapi/rdma/rdma_user_cm.h index 7cea03581f79..5ded174687ee 100644 --- a/include/uapi/rdma/rdma_user_cm.h +++ b/include/uapi/rdma/rdma_user_cm.h @@ -67,7 +67,9 @@ enum { RDMA_USER_CM_CMD_QUERY, RDMA_USER_CM_CMD_BIND, RDMA_USER_CM_CMD_RESOLVE_ADDR, - RDMA_USER_CM_CMD_JOIN_MCAST + RDMA_USER_CM_CMD_JOIN_MCAST, + RDMA_USER_CM_CMD_RESOLVE_IB_SERVICE, + RDMA_USER_CM_CMD_WRITE_CM_EVENT, }; /* See IBTA Annex A11, services ID bytes 4 & 5 */ @@ -147,7 +149,8 @@ struct rdma_ucm_resolve_route { enum { RDMA_USER_CM_QUERY_ADDR, RDMA_USER_CM_QUERY_PATH, - RDMA_USER_CM_QUERY_GID + RDMA_USER_CM_QUERY_GID, + RDMA_USER_CM_QUERY_IB_SERVICE }; struct rdma_ucm_query { @@ -187,6 +190,11 @@ struct rdma_ucm_query_path_resp { struct ib_path_rec_data path_data[]; }; +struct rdma_ucm_query_ib_service_resp { + __u32 num_service_recs; + struct ib_user_service_rec recs[]; +}; + struct rdma_ucm_conn_param { __u32 qp_num; __u32 qkey; @@ -297,6 +305,7 @@ struct rdma_ucm_event_resp { union { struct rdma_ucm_conn_param conn; struct rdma_ucm_ud_param ud; + __u32 arg32[2]; } param; __u32 reserved; struct rdma_ucm_ece ece; @@ -338,4 +347,33 @@ struct rdma_ucm_migrate_resp { __u32 events_reported; }; +enum { + RDMA_USER_CM_IB_SERVICE_FLAG_ID = 1 << 0, + RDMA_USER_CM_IB_SERVICE_FLAG_NAME = 1 << 1, +}; + +#define RDMA_USER_CM_IB_SERVICE_NAME_SIZE 64 +struct rdma_ucm_ib_service { + __u64 service_id; + __u8 service_name[RDMA_USER_CM_IB_SERVICE_NAME_SIZE]; + __u32 flags; + __u32 reserved; +}; + +struct rdma_ucm_resolve_ib_service { + __u32 id; + struct rdma_ucm_ib_service ibs; +}; + +struct rdma_ucm_write_cm_event { + __u32 id; + __u32 reserved; + __u32 event; + __u32 status; + union { + struct rdma_ucm_conn_param conn; + struct rdma_ucm_ud_param ud; + __u64 arg; + } param; +}; #endif /* RDMA_USER_CM_H */ diff --git a/include/uapi/scsi/fc/fc_els.h b/include/uapi/scsi/fc/fc_els.h index 16782c360de3..019096beb179 100644 --- a/include/uapi/scsi/fc/fc_els.h +++ b/include/uapi/scsi/fc/fc_els.h @@ -11,6 +11,12 @@ #include <linux/types.h> #include <asm/byteorder.h> +#ifdef __KERNEL__ +#include <linux/stddef.h> /* for offsetof */ +#else
+#include <stddef.h> /* for offsetof */ +#endif + /* * Fibre Channel Switch - Enhanced Link Services definitions. * From T11 FC-LS Rev 1.2 June 7, 2005. @@ -1109,12 +1115,15 @@ struct fc_els_fpin { /* Diagnostic Function Descriptor - FPIN Registration */ struct fc_df_desc_fpin_reg { - __be32 desc_tag; /* FPIN Registration (0x00030001) */ - __be32 desc_len; /* Length of Descriptor (in bytes). - * Size of descriptor excluding - * desc_tag and desc_len fields. - */ - __be32 count; /* Number of desc_tags elements */ + /* New members MUST be added within the __struct_group() macro below. */ + __struct_group(fc_df_desc_fpin_reg_hdr, __hdr, /* no attrs */, + __be32 desc_tag; /* FPIN Registration (0x00030001) */ + __be32 desc_len; /* Length of Descriptor (in bytes). + * Size of descriptor excluding + * desc_tag and desc_len fields. + */ + __be32 count; /* Number of desc_tags elements */ + ); __be32 desc_tags[]; /* Array of Descriptor Tags. * Each tag indicates a function * supported by the N_Port (request) @@ -1124,33 +1133,44 @@ struct fc_df_desc_fpin_reg { * See ELS_FN_DTAG_xxx for tag values. */ }; +_Static_assert(offsetof(struct fc_df_desc_fpin_reg, desc_tags) == sizeof(struct fc_df_desc_fpin_reg_hdr), + "struct member likely outside of __struct_group()"); /* * ELS_RDF - Register Diagnostic Functions */ struct fc_els_rdf { - __u8 fpin_cmd; /* command (0x19) */ - __u8 fpin_zero[3]; /* specified as zero - part of cmd */ - __be32 desc_len; /* Length of Descriptor List (in bytes). - * Size of ELS excluding fpin_cmd, - * fpin_zero and desc_len fields. - */ + /* New members MUST be added within the __struct_group() macro below. */ + __struct_group(fc_els_rdf_hdr, __hdr, /* no attrs */, + __u8 fpin_cmd; /* command (0x19) */ + __u8 fpin_zero[3]; /* specified as zero - part of cmd */ + __be32 desc_len; /* Length of Descriptor List (in bytes). + * Size of ELS excluding fpin_cmd, + * fpin_zero and desc_len fields. + */ + ); struct fc_tlv_desc desc[]; /* Descriptor list */ }; +_Static_assert(offsetof(struct fc_els_rdf, desc) == sizeof(struct fc_els_rdf_hdr), + "struct member likely outside of __struct_group()"); /* * ELS RDF LS_ACC Response. */ struct fc_els_rdf_resp { - struct fc_els_ls_acc acc_hdr; - __be32 desc_list_len; /* Length of response (in - * bytes). Excludes acc_hdr - * and desc_list_len fields. - */ - struct fc_els_lsri_desc lsri; + /* New members MUST be added within the __struct_group() macro below. */ + __struct_group(fc_els_rdf_resp_hdr, __hdr, /* no attrs */, + struct fc_els_ls_acc acc_hdr; + __be32 desc_list_len; /* Length of response (in + * bytes). Excludes acc_hdr + * and desc_list_len fields. + */ + struct fc_els_lsri_desc lsri; + ); struct fc_tlv_desc desc[]; /* Supported Descriptor list */ }; - +_Static_assert(offsetof(struct fc_els_rdf_resp, desc) == sizeof(struct fc_els_rdf_resp_hdr), + "struct member likely outside of __struct_group()"); /* * Diagnostic Capability Descriptors for EDC ELS diff --git a/include/ufs/ufs.h b/include/ufs/ufs.h index 72fd385037a6..245a6a829ce9 100644 --- a/include/ufs/ufs.h +++ b/include/ufs/ufs.h @@ -653,21 +653,4 @@ struct ufs_dev_info { bool hid_sup; }; -/* - * This enum is used in string mapping in ufs_trace.h. - */ -enum ufs_trace_str_t { - UFS_CMD_SEND, UFS_CMD_COMP, UFS_DEV_COMP, - UFS_QUERY_SEND, UFS_QUERY_COMP, UFS_QUERY_ERR, - UFS_TM_SEND, UFS_TM_COMP, UFS_TM_ERR -}; - -/* - * Transaction Specific Fields (TSF) type in the UPIU package, this enum is - * used in ufs_trace.h for UFS command trace. 
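All three fc_els.h conversions above follow the same recipe: __struct_group() from <linux/stddef.h> wraps the fixed leading members in a named header struct, and a _Static_assert() pins the flexible array to sizeof() that header so a member added outside the group is caught at build time. A generic sketch of the pattern (struct and member names are illustrative):

struct example_els {
	/* New members MUST be added within the __struct_group() macro below. */
	__struct_group(example_els_hdr, __hdr, /* no attrs */,
		__u8	cmd;
		__u8	zero[3];
		__be32	desc_len;
	);
	struct fc_tlv_desc desc[];	/* descriptor list */
};
_Static_assert(offsetof(struct example_els, desc) ==
	       sizeof(struct example_els_hdr),
	       "struct member likely outside of __struct_group()");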
- */ -enum ufs_trace_tsf_t { - UFS_TSF_CDB, UFS_TSF_OSF, UFS_TSF_TM_INPUT, UFS_TSF_TM_OUTPUT -}; - #endif /* End of Header */ diff --git a/include/ufs/ufs_quirks.h b/include/ufs/ufs_quirks.h index f52de5ed1b3b..83563247c36c 100644 --- a/include/ufs/ufs_quirks.h +++ b/include/ufs/ufs_quirks.h @@ -113,4 +113,7 @@ struct ufs_dev_quirk { */ #define UFS_DEVICE_QUIRK_PA_HIBER8TIME (1 << 12) +/* Some UFS 4 devices do not support the qTimestamp attribute */ +#define UFS_DEVICE_QUIRK_NO_TIMESTAMP_SUPPORT (1 << 13) + #endif /* UFS_QUIRKS_H_ */ diff --git a/include/ufs/ufshcd.h b/include/ufs/ufshcd.h index 1d3943777584..9425cfd9d00e 100644 --- a/include/ufs/ufshcd.h +++ b/include/ufs/ufshcd.h @@ -167,13 +167,13 @@ struct ufs_pm_lvl_states { * @task_tag: Task tag of the command * @lun: LUN of the command * @intr_cmd: Interrupt command (doesn't participate in interrupt aggregation) + * @req_abort_skip: skip request abort task flag * @issue_time_stamp: time stamp for debug purposes (CLOCK_MONOTONIC) * @issue_time_stamp_local_clock: time stamp for debug purposes (local_clock) * @compl_time_stamp: time stamp for statistics (CLOCK_MONOTONIC) * @compl_time_stamp_local_clock: time stamp for debug purposes (local_clock) * @crypto_key_slot: the key slot to use for inline crypto (-1 if none) * @data_unit_num: the data unit number for the first block for inline crypto - * @req_abort_skip: skip request abort task flag */ struct ufshcd_lrb { struct utp_transfer_req_desc *utr_descriptor_ptr; @@ -193,6 +193,7 @@ struct ufshcd_lrb { int task_tag; u8 lun; /* UPIU LUN id field is only 8-bit wide */ bool intr_cmd; + bool req_abort_skip; ktime_t issue_time_stamp; u64 issue_time_stamp_local_clock; ktime_t compl_time_stamp; @@ -201,8 +202,6 @@ struct ufshcd_lrb { int crypto_key_slot; u64 data_unit_num; #endif - - bool req_abort_skip; }; /** @@ -795,30 +794,6 @@ struct ufs_hba_monitor { }; /** - * struct ufshcd_res_info_t - MCQ related resource regions - * - * @name: resource name - * @resource: pointer to resource region - * @base: register base address - */ -struct ufshcd_res_info { - const char *name; - struct resource *resource; - void __iomem *base; -}; - -enum ufshcd_res { - RES_UFS, - RES_MCQ, - RES_MCQ_SQD, - RES_MCQ_SQIS, - RES_MCQ_CQD, - RES_MCQ_CQIS, - RES_MCQ_VS, - RES_MAX, -}; - -/** * struct ufshcd_mcq_opr_info_t - Operation and Runtime registers * * @offset: Doorbell Address Offset @@ -963,6 +938,7 @@ enum ufshcd_mcq_opr { * @ufs_rtc_update_work: A work for UFS RTC periodic update * @pm_qos_req: PM QoS request handle * @pm_qos_enabled: flag to check if pm qos is enabled + * @pm_qos_mutex: synchronizes PM QoS request and status updates * @critical_health_count: count of critical health exceptions * @dev_lvl_exception_count: count of device level exceptions since last reset * @dev_lvl_exception_id: vendor specific information about the @@ -1127,7 +1103,6 @@ struct ufs_hba { bool lsdb_sup; bool mcq_enabled; bool mcq_esi_enabled; - struct ufshcd_res_info res[RES_MAX]; void __iomem *mcq_base; struct ufs_hw_queue *uhq; struct ufs_hw_queue *dev_cmd_queue; @@ -1136,6 +1111,8 @@ struct ufs_hba { struct delayed_work ufs_rtc_update_work; struct pm_qos_request pm_qos_req; bool pm_qos_enabled; + /* synchronizes PM QoS request and status updates */ + struct mutex pm_qos_mutex; int critical_health_count; atomic_t dev_lvl_exception_count; @@ -1318,6 +1295,7 @@ static inline void ufshcd_rmwl(struct ufs_hba *hba, u32 mask, u32 val, u32 reg) void ufshcd_enable_irq(struct ufs_hba *hba); void ufshcd_disable_irq(struct ufs_hba 
*hba); +void ufshcd_enable_intr(struct ufs_hba *hba, u32 intrs); int ufshcd_alloc_host(struct device *, struct ufs_hba **); int ufshcd_hba_enable(struct ufs_hba *hba); int ufshcd_init(struct ufs_hba *, void __iomem *, unsigned int); @@ -1508,5 +1486,6 @@ int __ufshcd_write_ee_control(struct ufs_hba *hba, u32 ee_ctrl_mask); int ufshcd_write_ee_control(struct ufs_hba *hba); int ufshcd_update_ee_control(struct ufs_hba *hba, u16 *mask, const u16 *other_mask, u16 set, u16 clr); +void ufshcd_force_error_recovery(struct ufs_hba *hba); #endif /* End of Header */ diff --git a/kernel/acct.c b/kernel/acct.c index 6520baa13669..61630110e29d 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -44,19 +44,14 @@ * a struct file opened for write. Fixed. 2/6/2000, AV. */ -#include <linux/mm.h> #include <linux/slab.h> #include <linux/acct.h> #include <linux/capability.h> -#include <linux/file.h> #include <linux/tty.h> -#include <linux/security.h> -#include <linux/vfs.h> +#include <linux/statfs.h> #include <linux/jiffies.h> -#include <linux/times.h> #include <linux/syscalls.h> -#include <linux/mount.h> -#include <linux/uaccess.h> +#include <linux/namei.h> #include <linux/sched/cputime.h> #include <asm/div64.h> @@ -217,84 +212,70 @@ static void close_work(struct work_struct *work) complete(&acct->done); } -static int acct_on(struct filename *pathname) +DEFINE_FREE(fput_sync, struct file *, if (!IS_ERR_OR_NULL(_T)) __fput_sync(_T)) +static int acct_on(const char __user *name) { - struct file *file; - struct vfsmount *mnt, *internal; + /* Difference from BSD - they don't do O_APPEND */ + const int open_flags = O_WRONLY|O_APPEND|O_LARGEFILE; struct pid_namespace *ns = task_active_pid_ns(current); + struct filename *pathname __free(putname) = getname(name); + struct file *original_file __free(fput) = NULL; // in that order + struct path internal __free(path_put) = {}; // in that order + struct file *file __free(fput_sync) = NULL; // in that order struct bsd_acct_struct *acct; + struct vfsmount *mnt; struct fs_pin *old; - int err; - acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL); - if (!acct) - return -ENOMEM; + if (IS_ERR(pathname)) + return PTR_ERR(pathname); + original_file = file_open_name(pathname, open_flags, 0); + if (IS_ERR(original_file)) + return PTR_ERR(original_file); - /* Difference from BSD - they don't do O_APPEND */ - file = file_open_name(pathname, O_WRONLY|O_APPEND|O_LARGEFILE, 0); - if (IS_ERR(file)) { - kfree(acct); + mnt = mnt_clone_internal(&original_file->f_path); + if (IS_ERR(mnt)) + return PTR_ERR(mnt); + + internal.mnt = mnt; + internal.dentry = dget(mnt->mnt_root); + + file = dentry_open(&internal, open_flags, current_cred()); + if (IS_ERR(file)) return PTR_ERR(file); - } - if (!S_ISREG(file_inode(file)->i_mode)) { - kfree(acct); - filp_close(file, NULL); + if (!S_ISREG(file_inode(file)->i_mode)) return -EACCES; - } /* Exclude kernel internal filesystems. */ - if (file_inode(file)->i_sb->s_flags & (SB_NOUSER | SB_KERNMOUNT)) { - kfree(acct); - filp_close(file, NULL); + if (file_inode(file)->i_sb->s_flags & (SB_NOUSER | SB_KERNMOUNT)) return -EINVAL; - } /* Exclude procfs and sysfs.
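The acct_on() rewrite in this hunk leans on the scope-based cleanup helpers from <linux/cleanup.h>: each __free()-annotated variable is released in reverse declaration order on any return path (hence the "in that order" remarks), and no_free_ptr() later transfers ownership of the file into the acct structure instead of letting it be dropped. A standalone sketch of the idiom, repeating the fput_sync class defined above for self-containedness (the helper itself is hypothetical):

DEFINE_FREE(fput_sync, struct file *, if (!IS_ERR_OR_NULL(_T)) __fput_sync(_T))

static int example_open_check(const struct path *p)
{
	struct file *file __free(fput_sync) = NULL;

	file = dentry_open(p, O_RDONLY, current_cred());
	if (IS_ERR(file))
		return PTR_ERR(file);	/* cleanup skipped: IS_ERR_OR_NULL */

	if (!S_ISREG(file_inode(file)->i_mode))
		return -EACCES;		/* file is __fput_sync()'d here */

	return 0;			/* ... and here; no_free_ptr(file)
					 * would keep the reference instead */
}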
*/ - if (file_inode(file)->i_sb->s_iflags & SB_I_USERNS_VISIBLE) { - kfree(acct); - filp_close(file, NULL); + if (file_inode(file)->i_sb->s_iflags & SB_I_USERNS_VISIBLE) return -EINVAL; - } - if (!(file->f_mode & FMODE_CAN_WRITE)) { - kfree(acct); - filp_close(file, NULL); + if (!(file->f_mode & FMODE_CAN_WRITE)) return -EIO; - } - internal = mnt_clone_internal(&file->f_path); - if (IS_ERR(internal)) { - kfree(acct); - filp_close(file, NULL); - return PTR_ERR(internal); - } - err = mnt_get_write_access(internal); - if (err) { - mntput(internal); - kfree(acct); - filp_close(file, NULL); - return err; - } - mnt = file->f_path.mnt; - file->f_path.mnt = internal; + + acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL); + if (!acct) + return -ENOMEM; atomic_long_set(&acct->count, 1); init_fs_pin(&acct->pin, acct_pin_kill); - acct->file = file; + acct->file = no_free_ptr(file); acct->needcheck = jiffies; acct->ns = ns; mutex_init(&acct->lock); INIT_WORK(&acct->work, close_work); init_completion(&acct->done); mutex_lock_nested(&acct->lock, 1); /* nobody has seen it yet */ - pin_insert(&acct->pin, mnt); + pin_insert(&acct->pin, original_file->f_path.mnt); rcu_read_lock(); old = xchg(&ns->bacct, &acct->pin); mutex_unlock(&acct->lock); pin_kill(old); - mnt_put_write_access(mnt); - mntput(mnt); return 0; } @@ -319,14 +300,9 @@ SYSCALL_DEFINE1(acct, const char __user *, name) return -EPERM; if (name) { - struct filename *tmp = getname(name); - - if (IS_ERR(tmp)) - return PTR_ERR(tmp); mutex_lock(&acct_on_mutex); - error = acct_on(tmp); + error = acct_on(name); mutex_unlock(&acct_on_mutex); - putname(tmp); } else { rcu_read_lock(); pin_kill(task_active_pid_ns(current)->bacct); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 73bba397672a..ff40e5e65c43 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -15645,7 +15645,8 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } /* check dest operand */ - if (opcode == BPF_NEG) { + if (opcode == BPF_NEG && + regs[insn->dst_reg].type == SCALAR_VALUE) { err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK); err = err ?: adjust_scalar_min_max_vals(env, insn, ®s[insn->dst_reg], @@ -15803,7 +15804,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } else { /* all other ALU ops: and, sub, xor, add, ... 
*/ if (BPF_SRC(insn->code) == BPF_X) { - if (insn->imm != 0 || insn->off > 1 || + if (insn->imm != 0 || (insn->off != 0 && insn->off != 1) || (insn->off == 1 && opcode != BPF_MOD && opcode != BPF_DIV)) { verbose(env, "BPF_ALU uses reserved fields\n"); return -EINVAL; @@ -15813,7 +15814,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) if (err) return err; } else { - if (insn->src_reg != BPF_REG_0 || insn->off > 1 || + if (insn->src_reg != BPF_REG_0 || (insn->off != 0 && insn->off != 1) || (insn->off == 1 && opcode != BPF_MOD && opcode != BPF_DIV)) { verbose(env, "BPF_ALU uses reserved fields\n"); return -EINVAL; diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index b82399437db0..1e5c64cb6a42 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -38,8 +38,8 @@ enum { dma_debug_single, dma_debug_sg, dma_debug_coherent, - dma_debug_resource, dma_debug_noncoherent, + dma_debug_phy, }; enum map_err_types { @@ -141,8 +141,8 @@ static const char *type2name[] = { [dma_debug_single] = "single", [dma_debug_sg] = "scatter-gather", [dma_debug_coherent] = "coherent", - [dma_debug_resource] = "resource", [dma_debug_noncoherent] = "noncoherent", + [dma_debug_phy] = "phy", }; static const char *dir2name[] = { @@ -1054,17 +1054,16 @@ static void check_unmap(struct dma_debug_entry *ref) dma_entry_free(entry); } -static void check_for_stack(struct device *dev, - struct page *page, size_t offset) +static void check_for_stack(struct device *dev, phys_addr_t phys) { void *addr; struct vm_struct *stack_vm_area = task_stack_vm_area(current); if (!stack_vm_area) { /* Stack is direct-mapped. */ - if (PageHighMem(page)) + if (PhysHighMem(phys)) return; - addr = page_address(page) + offset; + addr = phys_to_virt(phys); if (object_is_on_stack(addr)) err_printk(dev, NULL, "device driver maps memory from stack [addr=%p]\n", addr); } else { @@ -1072,10 +1071,12 @@ static void check_for_stack(struct device *dev, int i; for (i = 0; i < stack_vm_area->nr_pages; i++) { - if (page != stack_vm_area->pages[i]) + if (__phys_to_pfn(phys) != + page_to_pfn(stack_vm_area->pages[i])) continue; - addr = (u8 *)current->stack + i * PAGE_SIZE + offset; + addr = (u8 *)current->stack + i * PAGE_SIZE + + (phys % PAGE_SIZE); err_printk(dev, NULL, "device driver maps memory from stack [probable addr=%p]\n", addr); break; } @@ -1204,9 +1205,8 @@ void debug_dma_map_single(struct device *dev, const void *addr, } EXPORT_SYMBOL(debug_dma_map_single); -void debug_dma_map_page(struct device *dev, struct page *page, size_t offset, - size_t size, int direction, dma_addr_t dma_addr, - unsigned long attrs) +void debug_dma_map_phys(struct device *dev, phys_addr_t phys, size_t size, + int direction, dma_addr_t dma_addr, unsigned long attrs) { struct dma_debug_entry *entry; @@ -1221,19 +1221,18 @@ void debug_dma_map_page(struct device *dev, struct page *page, size_t offset, return; entry->dev = dev; - entry->type = dma_debug_single; - entry->paddr = page_to_phys(page) + offset; + entry->type = dma_debug_phy; + entry->paddr = phys; entry->dev_addr = dma_addr; entry->size = size; entry->direction = direction; entry->map_err_type = MAP_ERR_NOT_CHECKED; - check_for_stack(dev, page, offset); + if (!(attrs & DMA_ATTR_MMIO)) { + check_for_stack(dev, phys); - if (!PageHighMem(page)) { - void *addr = page_address(page) + offset; - - check_for_illegal_area(dev, addr, size); + if (!PhysHighMem(phys)) + check_for_illegal_area(dev, phys_to_virt(phys), size); } add_dma_entry(entry, attrs); @@ -1277,11 +1276,11 @@ void 
debug_dma_mapping_error(struct device *dev, dma_addr_t dma_addr) } EXPORT_SYMBOL(debug_dma_mapping_error); -void debug_dma_unmap_page(struct device *dev, dma_addr_t dma_addr, +void debug_dma_unmap_phys(struct device *dev, dma_addr_t dma_addr, size_t size, int direction) { struct dma_debug_entry ref = { - .type = dma_debug_single, + .type = dma_debug_phy, .dev = dev, .dev_addr = dma_addr, .size = size, @@ -1305,7 +1304,7 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg, return; for_each_sg(sg, s, nents, i) { - check_for_stack(dev, sg_page(s), s->offset); + check_for_stack(dev, sg_phys(s)); if (!PageHighMem(sg_page(s))) check_for_illegal_area(dev, sg_virt(s), s->length); } @@ -1445,47 +1444,6 @@ void debug_dma_free_coherent(struct device *dev, size_t size, check_unmap(&ref); } -void debug_dma_map_resource(struct device *dev, phys_addr_t addr, size_t size, - int direction, dma_addr_t dma_addr, - unsigned long attrs) -{ - struct dma_debug_entry *entry; - - if (unlikely(dma_debug_disabled())) - return; - - entry = dma_entry_alloc(); - if (!entry) - return; - - entry->type = dma_debug_resource; - entry->dev = dev; - entry->paddr = addr; - entry->size = size; - entry->dev_addr = dma_addr; - entry->direction = direction; - entry->map_err_type = MAP_ERR_NOT_CHECKED; - - add_dma_entry(entry, attrs); -} - -void debug_dma_unmap_resource(struct device *dev, dma_addr_t dma_addr, - size_t size, int direction) -{ - struct dma_debug_entry ref = { - .type = dma_debug_resource, - .dev = dev, - .dev_addr = dma_addr, - .size = size, - .direction = direction, - }; - - if (unlikely(dma_debug_disabled())) - return; - - check_unmap(&ref); -} - void debug_dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size, int direction) { diff --git a/kernel/dma/debug.h b/kernel/dma/debug.h index 48757ca13f31..da7be0bddcf6 100644 --- a/kernel/dma/debug.h +++ b/kernel/dma/debug.h @@ -9,12 +9,11 @@ #define _KERNEL_DMA_DEBUG_H #ifdef CONFIG_DMA_API_DEBUG -extern void debug_dma_map_page(struct device *dev, struct page *page, - size_t offset, size_t size, - int direction, dma_addr_t dma_addr, +extern void debug_dma_map_phys(struct device *dev, phys_addr_t phys, + size_t size, int direction, dma_addr_t dma_addr, unsigned long attrs); -extern void debug_dma_unmap_page(struct device *dev, dma_addr_t addr, +extern void debug_dma_unmap_phys(struct device *dev, dma_addr_t addr, size_t size, int direction); extern void debug_dma_map_sg(struct device *dev, struct scatterlist *sg, @@ -31,14 +30,6 @@ extern void debug_dma_alloc_coherent(struct device *dev, size_t size, extern void debug_dma_free_coherent(struct device *dev, size_t size, void *virt, dma_addr_t addr); -extern void debug_dma_map_resource(struct device *dev, phys_addr_t addr, - size_t size, int direction, - dma_addr_t dma_addr, - unsigned long attrs); - -extern void debug_dma_unmap_resource(struct device *dev, dma_addr_t dma_addr, - size_t size, int direction); - extern void debug_dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size, int direction); @@ -62,14 +53,13 @@ extern void debug_dma_free_pages(struct device *dev, struct page *page, size_t size, int direction, dma_addr_t dma_addr); #else /* CONFIG_DMA_API_DEBUG */ -static inline void debug_dma_map_page(struct device *dev, struct page *page, - size_t offset, size_t size, - int direction, dma_addr_t dma_addr, - unsigned long attrs) +static inline void debug_dma_map_phys(struct device *dev, phys_addr_t phys, + size_t size, int direction, + dma_addr_t 
dma_addr, unsigned long attrs) { } -static inline void debug_dma_unmap_page(struct device *dev, dma_addr_t addr, +static inline void debug_dma_unmap_phys(struct device *dev, dma_addr_t addr, size_t size, int direction) { } @@ -97,19 +87,6 @@ static inline void debug_dma_free_coherent(struct device *dev, size_t size, { } -static inline void debug_dma_map_resource(struct device *dev, phys_addr_t addr, - size_t size, int direction, - dma_addr_t dma_addr, - unsigned long attrs) -{ -} - -static inline void debug_dma_unmap_resource(struct device *dev, - dma_addr_t dma_addr, size_t size, - int direction) -{ -} - static inline void debug_dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size, int direction) diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 24c359d9c879..1f9ee9759426 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -120,7 +120,7 @@ static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, gfp_t gfp, bool allow_highmem) { int node = dev_to_node(dev); - struct page *page = NULL; + struct page *page; u64 phys_limit; WARN_ON_ONCE(!PAGE_ALIGNED(size)); @@ -131,30 +131,25 @@ static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, gfp |= dma_direct_optimal_gfp_mask(dev, &phys_limit); page = dma_alloc_contiguous(dev, size, gfp); if (page) { - if (!dma_coherent_ok(dev, page_to_phys(page), size) || - (!allow_highmem && PageHighMem(page))) { - dma_free_contiguous(dev, page, size); - page = NULL; - } + if (dma_coherent_ok(dev, page_to_phys(page), size) && + (allow_highmem || !PageHighMem(page))) + return page; + + dma_free_contiguous(dev, page, size); } -again: - if (!page) - page = alloc_pages_node(node, gfp, get_order(size)); - if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { + + while ((page = alloc_pages_node(node, gfp, get_order(size))) + && !dma_coherent_ok(dev, page_to_phys(page), size)) { __free_pages(page, get_order(size)); - page = NULL; if (IS_ENABLED(CONFIG_ZONE_DMA32) && phys_limit < DMA_BIT_MASK(64) && - !(gfp & (GFP_DMA32 | GFP_DMA))) { + !(gfp & (GFP_DMA32 | GFP_DMA))) gfp |= GFP_DMA32; - goto again; - } - - if (IS_ENABLED(CONFIG_ZONE_DMA) && !(gfp & GFP_DMA)) { + else if (IS_ENABLED(CONFIG_ZONE_DMA) && !(gfp & GFP_DMA)) gfp = (gfp & ~GFP_DMA32) | GFP_DMA; - goto again; - } + else + return NULL; } return page; @@ -453,7 +448,7 @@ void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl, if (sg_dma_is_bus_address(sg)) sg_dma_unmark_bus_address(sg); else - dma_direct_unmap_page(dev, sg->dma_address, + dma_direct_unmap_phys(dev, sg->dma_address, sg_dma_len(sg), dir, attrs); } } @@ -476,8 +471,8 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, */ break; case PCI_P2PDMA_MAP_NONE: - sg->dma_address = dma_direct_map_page(dev, sg_page(sg), - sg->offset, sg->length, dir, attrs); + sg->dma_address = dma_direct_map_phys(dev, sg_phys(sg), + sg->length, dir, attrs); if (sg->dma_address == DMA_MAPPING_ERROR) { ret = -EIO; goto out_unmap; @@ -502,22 +497,6 @@ out_unmap: return ret; } -dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir, unsigned long attrs) -{ - dma_addr_t dma_addr = paddr; - - if (unlikely(!dma_capable(dev, dma_addr, size, false))) { - dev_err_once(dev, - "DMA addr %pad+%zu overflow (mask %llx, bus limit %llx).\n", - &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit); - WARN_ON_ONCE(1); - return DMA_MAPPING_ERROR; - } - - return dma_addr; -} - int 
dma_direct_get_sgtable(struct device *dev, struct sg_table *sgt, void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs) diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h index d2c0b7e632fc..da2fadf45bcd 100644 --- a/kernel/dma/direct.h +++ b/kernel/dma/direct.h @@ -80,42 +80,57 @@ static inline void dma_direct_sync_single_for_cpu(struct device *dev, arch_dma_mark_clean(paddr, size); } -static inline dma_addr_t dma_direct_map_page(struct device *dev, - struct page *page, unsigned long offset, size_t size, - enum dma_data_direction dir, unsigned long attrs) +static inline dma_addr_t dma_direct_map_phys(struct device *dev, + phys_addr_t phys, size_t size, enum dma_data_direction dir, + unsigned long attrs) { - phys_addr_t phys = page_to_phys(page) + offset; - dma_addr_t dma_addr = phys_to_dma(dev, phys); + dma_addr_t dma_addr; if (is_swiotlb_force_bounce(dev)) { - if (is_pci_p2pdma_page(page)) - return DMA_MAPPING_ERROR; + if (attrs & DMA_ATTR_MMIO) + goto err_overflow; + return swiotlb_map(dev, phys, size, dir, attrs); } - if (unlikely(!dma_capable(dev, dma_addr, size, true)) || - dma_kmalloc_needs_bounce(dev, size, dir)) { - if (is_pci_p2pdma_page(page)) - return DMA_MAPPING_ERROR; - if (is_swiotlb_active(dev)) - return swiotlb_map(dev, phys, size, dir, attrs); - - dev_WARN_ONCE(dev, 1, - "DMA addr %pad+%zu overflow (mask %llx, bus limit %llx).\n", - &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit); - return DMA_MAPPING_ERROR; + if (attrs & DMA_ATTR_MMIO) { + dma_addr = phys; + if (unlikely(!dma_capable(dev, dma_addr, size, false))) + goto err_overflow; + } else { + dma_addr = phys_to_dma(dev, phys); + if (unlikely(!dma_capable(dev, dma_addr, size, true)) || + dma_kmalloc_needs_bounce(dev, size, dir)) { + if (is_swiotlb_active(dev)) + return swiotlb_map(dev, phys, size, dir, attrs); + + goto err_overflow; + } } - if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) + if (!dev_is_dma_coherent(dev) && + !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO))) arch_sync_dma_for_device(phys, size, dir); return dma_addr; + +err_overflow: + dev_WARN_ONCE( + dev, 1, + "DMA addr %pad+%zu overflow (mask %llx, bus limit %llx).\n", + &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit); + return DMA_MAPPING_ERROR; } -static inline void dma_direct_unmap_page(struct device *dev, dma_addr_t addr, +static inline void dma_direct_unmap_phys(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { - phys_addr_t phys = dma_to_phys(dev, addr); + phys_addr_t phys; + + if (attrs & DMA_ATTR_MMIO) + /* nothing to do: uncached and no swiotlb */ + return; + phys = dma_to_phys(dev, addr); if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) dma_direct_sync_single_for_cpu(dev, addr, size, dir); diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 56de28a3b179..fe7472f13b10 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -152,11 +152,11 @@ static inline bool dma_map_direct(struct device *dev, return dma_go_direct(dev, *dev->dma_mask, ops); } -dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, - size_t offset, size_t size, enum dma_data_direction dir, - unsigned long attrs) +dma_addr_t dma_map_phys(struct device *dev, phys_addr_t phys, size_t size, + enum dma_data_direction dir, unsigned long attrs) { const struct dma_map_ops *ops = get_dma_ops(dev); + bool is_mmio = attrs & DMA_ATTR_MMIO; dma_addr_t addr; BUG_ON(!valid_dma_direction(dir)); @@ -165,36 +165,81 @@ dma_addr_t dma_map_page_attrs(struct device *dev, 
struct page *page, return DMA_MAPPING_ERROR; if (dma_map_direct(dev, ops) || - arch_dma_map_page_direct(dev, page_to_phys(page) + offset + size)) - addr = dma_direct_map_page(dev, page, offset, size, dir, attrs); + (!is_mmio && arch_dma_map_phys_direct(dev, phys + size))) + addr = dma_direct_map_phys(dev, phys, size, dir, attrs); else if (use_dma_iommu(dev)) - addr = iommu_dma_map_page(dev, page, offset, size, dir, attrs); - else + addr = iommu_dma_map_phys(dev, phys, size, dir, attrs); + else if (is_mmio) { + if (!ops->map_resource) + return DMA_MAPPING_ERROR; + + addr = ops->map_resource(dev, phys, size, dir, attrs); + } else { + struct page *page = phys_to_page(phys); + size_t offset = offset_in_page(phys); + + /* + * The dma_ops API contract for ops->map_page() requires + * kmappable memory, while ops->map_resource() does not. + */ addr = ops->map_page(dev, page, offset, size, dir, attrs); - kmsan_handle_dma(page, offset, size, dir); - trace_dma_map_page(dev, page_to_phys(page) + offset, addr, size, dir, - attrs); - debug_dma_map_page(dev, page, offset, size, dir, addr, attrs); + } + + if (!is_mmio) + kmsan_handle_dma(phys, size, dir); + trace_dma_map_phys(dev, phys, addr, size, dir, attrs); + debug_dma_map_phys(dev, phys, size, dir, addr, attrs); return addr; } +EXPORT_SYMBOL_GPL(dma_map_phys); + +dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, + size_t offset, size_t size, enum dma_data_direction dir, + unsigned long attrs) +{ + phys_addr_t phys = page_to_phys(page) + offset; + + if (unlikely(attrs & DMA_ATTR_MMIO)) + return DMA_MAPPING_ERROR; + + if (IS_ENABLED(CONFIG_DMA_API_DEBUG) && + WARN_ON_ONCE(is_zone_device_page(page))) + return DMA_MAPPING_ERROR; + + return dma_map_phys(dev, phys, size, dir, attrs); +} EXPORT_SYMBOL(dma_map_page_attrs); -void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size, +void dma_unmap_phys(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { const struct dma_map_ops *ops = get_dma_ops(dev); + bool is_mmio = attrs & DMA_ATTR_MMIO; BUG_ON(!valid_dma_direction(dir)); if (dma_map_direct(dev, ops) || - arch_dma_unmap_page_direct(dev, addr + size)) - dma_direct_unmap_page(dev, addr, size, dir, attrs); + (!is_mmio && arch_dma_unmap_phys_direct(dev, addr + size))) + dma_direct_unmap_phys(dev, addr, size, dir, attrs); else if (use_dma_iommu(dev)) - iommu_dma_unmap_page(dev, addr, size, dir, attrs); - else + iommu_dma_unmap_phys(dev, addr, size, dir, attrs); + else if (is_mmio) { + if (ops->unmap_resource) + ops->unmap_resource(dev, addr, size, dir, attrs); + } else ops->unmap_page(dev, addr, size, dir, attrs); - trace_dma_unmap_page(dev, addr, size, dir, attrs); - debug_dma_unmap_page(dev, addr, size, dir); + trace_dma_unmap_phys(dev, addr, size, dir, attrs); + debug_dma_unmap_phys(dev, addr, size, dir); +} +EXPORT_SYMBOL_GPL(dma_unmap_phys); + +void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size, + enum dma_data_direction dir, unsigned long attrs) +{ + if (unlikely(attrs & DMA_ATTR_MMIO)) + return; + + dma_unmap_phys(dev, addr, size, dir, attrs); } EXPORT_SYMBOL(dma_unmap_page_attrs); @@ -321,41 +366,18 @@ EXPORT_SYMBOL(dma_unmap_sg_attrs); dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { - const struct dma_map_ops *ops = get_dma_ops(dev); - dma_addr_t addr = DMA_MAPPING_ERROR; - - BUG_ON(!valid_dma_direction(dir)); - - if (WARN_ON_ONCE(!dev->dma_mask)) + if 
(IS_ENABLED(CONFIG_DMA_API_DEBUG) && + WARN_ON_ONCE(pfn_valid(PHYS_PFN(phys_addr)))) return DMA_MAPPING_ERROR; - if (dma_map_direct(dev, ops)) - addr = dma_direct_map_resource(dev, phys_addr, size, dir, attrs); - else if (use_dma_iommu(dev)) - addr = iommu_dma_map_resource(dev, phys_addr, size, dir, attrs); - else if (ops->map_resource) - addr = ops->map_resource(dev, phys_addr, size, dir, attrs); - - trace_dma_map_resource(dev, phys_addr, addr, size, dir, attrs); - debug_dma_map_resource(dev, phys_addr, size, dir, addr, attrs); - return addr; + return dma_map_phys(dev, phys_addr, size, dir, attrs | DMA_ATTR_MMIO); } EXPORT_SYMBOL(dma_map_resource); void dma_unmap_resource(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { - const struct dma_map_ops *ops = get_dma_ops(dev); - - BUG_ON(!valid_dma_direction(dir)); - if (dma_map_direct(dev, ops)) - ; /* nothing to do: uncached and no swiotlb */ - else if (use_dma_iommu(dev)) - iommu_dma_unmap_resource(dev, addr, size, dir, attrs); - else if (ops->unmap_resource) - ops->unmap_resource(dev, addr, size, dir, attrs); - trace_dma_unmap_resource(dev, addr, size, dir, attrs); - debug_dma_unmap_resource(dev, addr, size, dir); + dma_unmap_phys(dev, addr, size, dir, attrs | DMA_ATTR_MMIO); } EXPORT_SYMBOL(dma_unmap_resource); diff --git a/kernel/dma/ops_helpers.c b/kernel/dma/ops_helpers.c index 9afd569eadb9..6f9d604d9d40 100644 --- a/kernel/dma/ops_helpers.c +++ b/kernel/dma/ops_helpers.c @@ -72,8 +72,8 @@ struct page *dma_common_alloc_pages(struct device *dev, size_t size, return NULL; if (use_dma_iommu(dev)) - *dma_handle = iommu_dma_map_page(dev, page, 0, size, dir, - DMA_ATTR_SKIP_CPU_SYNC); + *dma_handle = iommu_dma_map_phys(dev, page_to_phys(page), size, + dir, DMA_ATTR_SKIP_CPU_SYNC); else *dma_handle = ops->map_page(dev, page, 0, size, dir, DMA_ATTR_SKIP_CPU_SYNC); @@ -92,7 +92,7 @@ void dma_common_free_pages(struct device *dev, size_t size, struct page *page, const struct dma_map_ops *ops = get_dma_ops(dev); if (use_dma_iommu(dev)) - iommu_dma_unmap_page(dev, dma_handle, size, dir, + iommu_dma_unmap_phys(dev, dma_handle, size, dir, DMA_ATTR_SKIP_CPU_SYNC); else if (ops->unmap_page) ops->unmap_page(dev, dma_handle, size, dir, diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index abcf3fa63a56..0d37da3d95b6 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -1209,7 +1209,7 @@ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr, nslabs = nr_slots(alloc_size); phys_limit = min_not_zero(*dev->dma_mask, dev->bus_dma_limit); pool = swiotlb_alloc_pool(dev, nslabs, nslabs, 1, phys_limit, - GFP_NOWAIT | __GFP_NOWARN); + GFP_NOWAIT); if (!pool) return -1; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 8f23f5273bab..4f87c16d915a 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -899,7 +899,7 @@ const struct bpf_func_proto bpf_send_signal_thread_proto = { .arg1_type = ARG_ANYTHING, }; -BPF_CALL_3(bpf_d_path, struct path *, path, char *, buf, u32, sz) +BPF_CALL_3(bpf_d_path, const struct path *, path, char *, buf, u32, sz) { struct path copy; long len; diff --git a/mm/filemap.c b/mm/filemap.c index a52dd38d2b4a..13f0259d993c 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1621,7 +1621,7 @@ static void filemap_end_dropbehind(struct folio *folio) * completes. Do that now. If we fail, it's likely because of a big folio - * just reset dropbehind for that case and later completions should invalidate.
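The filemap.c hunk continuing below splits folio_end_writeback() so that the dropbehind handling can be deferred by interested filesystems. A hedged sketch of a completion path built from the two newly exported halves (the surrounding function is hypothetical; the extra reference mirrors the one folio_end_writeback() itself takes):

/* Hypothetical write-completion path that defers dropbehind. */
static void example_write_completion(struct folio *folio)
{
	folio_get(folio);	/* keep the folio across the wakeup */
	folio_end_writeback_no_dropbehind(folio);

	/* ... filesystem bookkeeping that must precede reclaim ... */

	folio_end_dropbehind(folio);	/* may invalidate the clean folio */
	folio_put(folio);
}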
*/ -static void filemap_end_dropbehind_write(struct folio *folio) +void folio_end_dropbehind(struct folio *folio) { if (!folio_test_dropbehind(folio)) return; @@ -1638,16 +1638,18 @@ static void filemap_end_dropbehind_write(struct folio *folio) folio_unlock(folio); } } +EXPORT_SYMBOL_GPL(folio_end_dropbehind); /** - * folio_end_writeback - End writeback against a folio. + * folio_end_writeback_no_dropbehind - End writeback against a folio. * @folio: The folio. * * The folio must actually be under writeback. + * This call is intended for filesystems that need to defer dropbehind. * * Context: May be called from process or interrupt context. */ -void folio_end_writeback(struct folio *folio) +void folio_end_writeback_no_dropbehind(struct folio *folio) { VM_BUG_ON_FOLIO(!folio_test_writeback(folio), folio); @@ -1663,6 +1665,25 @@ void folio_end_writeback(struct folio *folio) folio_rotate_reclaimable(folio); } + if (__folio_end_writeback(folio)) + folio_wake_bit(folio, PG_writeback); + + acct_reclaim_writeback(folio); +} +EXPORT_SYMBOL_GPL(folio_end_writeback_no_dropbehind); + +/** + * folio_end_writeback - End writeback against a folio. + * @folio: The folio. + * + * The folio must actually be under writeback. + * + * Context: May be called from process or interrupt context. + */ +void folio_end_writeback(struct folio *folio) +{ + VM_BUG_ON_FOLIO(!folio_test_writeback(folio), folio); + /* * Writeback does not hold a folio reference of its own, relying * on truncation to wait for the clearing of PG_writeback. @@ -1670,11 +1691,8 @@ void folio_end_writeback(struct folio *folio) * reused before the folio_wake_bit(). */ folio_get(folio); - if (__folio_end_writeback(folio)) - folio_wake_bit(folio, PG_writeback); - - filemap_end_dropbehind_write(folio); - acct_reclaim_writeback(folio); + folio_end_writeback_no_dropbehind(folio); + folio_end_dropbehind(folio); folio_put(folio); } EXPORT_SYMBOL(folio_end_writeback); @@ -806,7 +806,7 @@ dma_addr_t hmm_dma_map_pfn(struct device *dev, struct hmm_dma_map *map, case PCI_P2PDMA_MAP_NONE: break; case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: - attrs |= DMA_ATTR_SKIP_CPU_SYNC; + attrs |= DMA_ATTR_MMIO; pfns[idx] |= HMM_PFN_P2PDMA; break; case PCI_P2PDMA_MAP_BUS_ADDR: @@ -835,8 +835,8 @@ dma_addr_t hmm_dma_map_pfn(struct device *dev, struct hmm_dma_map *map, if (WARN_ON_ONCE(dma_need_unmap(dev) && !dma_addrs)) goto error; - dma_addr = dma_map_page(dev, page, 0, map->dma_entry_size, - DMA_BIDIRECTIONAL); + dma_addr = dma_map_phys(dev, paddr, map->dma_entry_size, + DMA_BIDIRECTIONAL, attrs); if (dma_mapping_error(dev, dma_addr)) goto error; @@ -871,16 +871,17 @@ bool hmm_dma_unmap_pfn(struct device *dev, struct hmm_dma_map *map, size_t idx) if ((pfns[idx] & valid_dma) != valid_dma) return false; + if (pfns[idx] & HMM_PFN_P2PDMA) + attrs |= DMA_ATTR_MMIO; + if (pfns[idx] & HMM_PFN_P2PDMA_BUS) ; /* no need to unmap bus address P2P mappings */ - else if (dma_use_iova(state)) { - if (pfns[idx] & HMM_PFN_P2PDMA) - attrs |= DMA_ATTR_SKIP_CPU_SYNC; + else if (dma_use_iova(state)) dma_iova_unlink(dev, state, idx * map->dma_entry_size, map->dma_entry_size, DMA_BIDIRECTIONAL, attrs); - } else if (dma_need_unmap(dev)) - dma_unmap_page(dev, dma_addrs[idx], map->dma_entry_size, - DMA_BIDIRECTIONAL); + else if (dma_need_unmap(dev)) + dma_unmap_phys(dev, dma_addrs[idx], map->dma_entry_size, + DMA_BIDIRECTIONAL, attrs); pfns[idx] &= ~(HMM_PFN_DMA_MAPPED | HMM_PFN_P2PDMA | HMM_PFN_P2PDMA_BUS); diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c index 97de3d6194f0..90bee565b9bc 100644 --- 
a/mm/kmsan/hooks.c +++ b/mm/kmsan/hooks.c @@ -336,14 +336,16 @@ static void kmsan_handle_dma_page(const void *addr, size_t size, } /* Helper function to handle DMA data transfers. */ -void kmsan_handle_dma(struct page *page, size_t offset, size_t size, +void kmsan_handle_dma(phys_addr_t phys, size_t size, enum dma_data_direction dir) { - u64 page_offset, to_go, addr; + struct page *page = phys_to_page(phys); + u64 page_offset, to_go; + void *addr; - if (PageHighMem(page)) + if (PhysHighMem(phys)) return; - addr = (u64)page_address(page) + offset; + addr = page_to_virt(page); /* * The kernel may occasionally give us adjacent DMA pages not belonging * to the same allocation. Process them separately to avoid triggering @@ -366,8 +368,7 @@ void kmsan_handle_dma_sg(struct scatterlist *sg, int nents, int i; for_each_sg(sg, item, nents, i) - kmsan_handle_dma(sg_page(item), item->offset, item->length, - dir); + kmsan_handle_dma(sg_phys(item), item->length, dir); } /* Functions from kmsan-checks.h follow. */ diff --git a/net/sunrpc/Kconfig b/net/sunrpc/Kconfig index 2d8b67dac7b5..a570e7adf270 100644 --- a/net/sunrpc/Kconfig +++ b/net/sunrpc/Kconfig @@ -101,6 +101,20 @@ config SUNRPC_DEBUG If unsure, say Y. +config SUNRPC_DEBUG_TRACE + bool "RPC: Send dprintk() output to the trace buffer" + depends on SUNRPC_DEBUG && TRACING + default n + help + dprintk() output can be voluminous, which can overwhelm the + kernel's logging facility as it must be sent to the console. + This option causes dprintk() output to go to the trace buffer + instead of the kernel log. + + This will cause warnings about the use of trace_printk() to be + logged at boot time, so say N unless you are debugging a problem + with sunrpc-based clients or services. + config SUNRPC_XPRT_RDMA tristate "RPC-over-RDMA transport" depends on SUNRPC && INFINIBAND && INFINIBAND_ADDR_TRANS diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.c b/net/sunrpc/auth_gss/gss_rpc_xdr.c index cb32ab9a8395..7d2cdc2bd374 100644 --- a/net/sunrpc/auth_gss/gss_rpc_xdr.c +++ b/net/sunrpc/auth_gss/gss_rpc_xdr.c @@ -794,12 +794,12 @@ int gssx_dec_accept_sec_context(struct rpc_rqst *rqstp, struct gssx_res_accept_sec_context *res = data; u32 value_follows; int err; - struct page *scratch; + struct folio *scratch; - scratch = alloc_page(GFP_KERNEL); + scratch = folio_alloc(GFP_KERNEL, 0); if (!scratch) return -ENOMEM; - xdr_set_scratch_page(xdr, scratch); + xdr_set_scratch_folio(xdr, scratch); /* res->status */ err = gssx_dec_status(xdr, &res->status); @@ -844,6 +844,6 @@ int gssx_dec_accept_sec_context(struct rpc_rqst *rqstp, err = gssx_dec_option_array(xdr, &res->options); out_free: - __free_page(scratch); + folio_put(scratch); return err; } diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 9b45fbdc90ca..016f16ca5779 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -1074,7 +1074,6 @@ int rpc_malloc(struct rpc_task *task) rqst->rq_rbuffer = (char *)rqst->rq_buffer + rqst->rq_callsize; return 0; } -EXPORT_SYMBOL_GPL(rpc_malloc); /** * rpc_free - free RPC buffer resources allocated via rpc_malloc @@ -1095,7 +1094,6 @@ void rpc_free(struct rpc_task *task) else kfree(buf); } -EXPORT_SYMBOL_GPL(rpc_free); /* * Creation and deletion of RPC task structures diff --git a/net/sunrpc/socklib.c b/net/sunrpc/socklib.c index 4e92e2a50168..d8d8842c7de5 100644 --- a/net/sunrpc/socklib.c +++ b/net/sunrpc/socklib.c @@ -86,7 +86,7 @@ xdr_partial_copy_from_skb(struct xdr_buf *xdr, struct xdr_skb_reader *desc) /* ACL likes to be lazy in allocating pages - ACLs * are
small by default but can get huge. */ if ((xdr->flags & XDRBUF_SPARSE_PAGES) && *ppage == NULL) { - *ppage = alloc_page(GFP_NOWAIT | __GFP_NOWARN); + *ppage = alloc_page(GFP_NOWAIT); if (unlikely(*ppage == NULL)) { if (copied == 0) return -ENOMEM; diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index b1fab3a69544..de05ef637bdc 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -352,7 +352,7 @@ static int svc_pool_map_get_node(unsigned int pidx) if (m->mode == SVC_POOL_PERNODE) return m->pool_to[pidx]; } - return NUMA_NO_NODE; + return numa_mem_id(); } /* * Set the given thread's cpus_allowed mask so that it @@ -436,7 +436,6 @@ void svc_rpcb_cleanup(struct svc_serv *serv, struct net *net) svc_unregister(serv, net); rpcb_put_local(net); } -EXPORT_SYMBOL_GPL(svc_rpcb_cleanup); static int svc_uses_rpcbind(struct svc_serv *serv) { @@ -670,8 +669,8 @@ svc_rqst_free(struct svc_rqst *rqstp) folio_batch_release(&rqstp->rq_fbatch); kfree(rqstp->rq_bvec); svc_release_buffer(rqstp); - if (rqstp->rq_scratch_page) - put_page(rqstp->rq_scratch_page); + if (rqstp->rq_scratch_folio) + folio_put(rqstp->rq_scratch_folio); kfree(rqstp->rq_resp); kfree(rqstp->rq_argp); kfree(rqstp->rq_auth_data); @@ -692,8 +691,8 @@ svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool, int node) rqstp->rq_server = serv; rqstp->rq_pool = pool; - rqstp->rq_scratch_page = alloc_pages_node(node, GFP_KERNEL, 0); - if (!rqstp->rq_scratch_page) + rqstp->rq_scratch_folio = __folio_alloc_node(GFP_KERNEL, 0, node); + if (!rqstp->rq_scratch_folio) goto out_enomem; rqstp->rq_argp = kmalloc_node(serv->sv_xdrsize, GFP_KERNEL, node); diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 8b1837228799..049ab53088e9 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -1102,6 +1102,7 @@ static void svc_clean_up_xprts(struct svc_serv *serv, struct net *net) * svc_xprt_destroy_all - Destroy transports associated with @serv * @serv: RPC service to be shut down * @net: target network namespace + * @unregister: true if it is OK to unregister the destroyed xprts * * Server threads may still be running (especially in the case where the * service is still running in other network namespaces). @@ -1114,7 +1115,8 @@ static void svc_clean_up_xprts(struct svc_serv *serv, struct net *net) * threads, we may need to wait a little while and then check again to * see if they're done. 
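The rq_scratch_page to rq_scratch_folio conversion above is the standard page-to-folio recipe; a minimal before/after sketch using the same calls as the patch (illustrative only):

    /* Before: an order-0 page on a specific NUMA node, dropped with put_page() */
    struct page *sp = alloc_pages_node(node, GFP_KERNEL, 0);
    /* After: the order-0 folio equivalent, dropped with folio_put() */
    struct folio *sf = __folio_alloc_node(GFP_KERNEL, 0, node);

The conversion is mechanical, which is why the gss_rpc_xdr.c and svc.c hunks mirror each other.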
*/ -void svc_xprt_destroy_all(struct svc_serv *serv, struct net *net) +void svc_xprt_destroy_all(struct svc_serv *serv, struct net *net, + bool unregister) { int delay = 0; @@ -1124,6 +1126,9 @@ void svc_xprt_destroy_all(struct svc_serv *serv, struct net *net) svc_clean_up_xprts(serv, net); msleep(delay++); } + + if (unregister) + svc_rpcb_cleanup(serv, net); } EXPORT_SYMBOL_GPL(svc_xprt_destroy_all); diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 1478c41c7e9d..3aac1456e23e 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -190,7 +190,7 @@ rpcrdma_alloc_sparse_pages(struct xdr_buf *buf) ppages = buf->pages + (buf->page_base >> PAGE_SHIFT); while (len > 0) { if (!*ppages) - *ppages = alloc_page(GFP_NOWAIT | __GFP_NOWARN); + *ppages = alloc_page(GFP_NOWAIT); if (!*ppages) return -ENOBUFS; ppages++; diff --git a/rust/kernel/dma.rs b/rust/kernel/dma.rs index 2569c21208e3..4e0af3e1a3b9 100644 --- a/rust/kernel/dma.rs +++ b/rust/kernel/dma.rs @@ -252,6 +252,9 @@ pub mod attrs { /// Indicates that the buffer is fully accessible at an elevated privilege level (and /// ideally inaccessible or at least read-only at lesser-privileged levels). pub const DMA_ATTR_PRIVILEGED: Attrs = Attrs(bindings::DMA_ATTR_PRIVILEGED); + + /// Indicates that the buffer is MMIO memory. + pub const DMA_ATTR_MMIO: Attrs = Attrs(bindings::DMA_ATTR_MMIO); } /// DMA data direction. diff --git a/scripts/kernel-doc.py b/scripts/kernel-doc.py index fc3d46ef519f..d9fe2bcbd39c 100755 --- a/scripts/kernel-doc.py +++ b/scripts/kernel-doc.py @@ -2,8 +2,17 @@ # SPDX-License-Identifier: GPL-2.0 # Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>. # -# pylint: disable=C0103,R0915 -# +# pylint: disable=C0103,R0912,R0914,R0915 + +# NOTE: While kernel-doc requires at least version 3.6 to run, the +# command line should work with Python 3.2+ (tested with 3.4). +# The rationale is that it shall fail gracefully during Kernel +# compilation with older Python versions. Due to that: +# - an encoding line is needed here; +# - no f-strings can be used in this file; +# - the libraries that require newer versions can only be included +# after the Python version is checked. + # Converted from the kernel-doc script originally written in Perl # under GPLv2, copyrighted since 1998 by the following authors: # @@ -107,9 +116,6 @@ SRC_DIR = os.path.dirname(os.path.realpath(__file__)) sys.path.insert(0, os.path.join(SRC_DIR, LIB_DIR)) -from kdoc_files import KernelFiles # pylint: disable=C0413 -from kdoc_output import RestFormat, ManFormat # pylint: disable=C0413 - DESC = """ Read C language source or header FILEs, extract embedded documentation comments, and print formatted documentation to standard output. @@ -273,14 +279,22 @@ def main(): python_ver = sys.version_info[:2] if python_ver < (3,6): - logger.warning("Python 3.6 or later is required by kernel-doc") + # Depending on Kernel configuration, kernel-doc --none is called at + # build time. As we don't want to break compilation due to the + # usage of an old Python version, return 0 here. + if args.none: + logger.error("Python 3.6 or later is required by kernel-doc. Skipping checks") + sys.exit(0) - # Return 0 here to avoid breaking compilation - sys.exit(0) + sys.exit("Python 3.6 or later is required by kernel-doc.
Aborting.") if python_ver < (3,7): logger.warning("Python 3.7 or later is required for correct results") + # Import kernel-doc libraries only after checking Python version + from kdoc_files import KernelFiles # pylint: disable=C0415 + from kdoc_output import RestFormat, ManFormat # pylint: disable=C0415 + if args.man: out_style = ManFormat(modulename=args.modulename) elif args.none: @@ -308,11 +322,11 @@ def main(): sys.exit(0) if args.werror: - print(f"{error_count} warnings as errors") + print("%s warnings as errors" % error_count) # pylint: disable=C0209 sys.exit(error_count) if args.verbose: - print(f"{error_count} errors") + print("%s errors" % error_count) # pylint: disable=C0209 if args.none: sys.exit(0) diff --git a/scripts/lib/kdoc/kdoc_parser.py b/scripts/lib/kdoc/kdoc_parser.py index fe730099eca8..2376f180b1fa 100644 --- a/scripts/lib/kdoc/kdoc_parser.py +++ b/scripts/lib/kdoc/kdoc_parser.py @@ -46,7 +46,7 @@ doc_decl = doc_com + KernRe(r'(\w+)', cache=False) known_section_names = 'description|context|returns?|notes?|examples?' known_sections = KernRe(known_section_names, flags = re.I) doc_sect = doc_com + \ - KernRe(r'\s*(\@[.\w]+|\@\.\.\.|' + known_section_names + r')\s*:([^:].*)?$', + KernRe(r'\s*(@[.\w]+|@\.\.\.|' + known_section_names + r')\s*:([^:].*)?$', flags=re.I, cache=False) doc_content = doc_com_body + KernRe(r'(.*)', cache=False) @@ -54,13 +54,11 @@ doc_inline_start = KernRe(r'^\s*/\*\*\s*$', cache=False) doc_inline_sect = KernRe(r'\s*\*\s*(@\s*[\w][\w\.]*\s*):(.*)', cache=False) doc_inline_end = KernRe(r'^\s*\*/\s*$', cache=False) doc_inline_oneline = KernRe(r'^\s*/\*\*\s*(@[\w\s]+):\s*(.*)\s*\*/\s*$', cache=False) -attribute = KernRe(r"__attribute__\s*\(\([a-z0-9,_\*\s\(\)]*\)\)", - flags=re.I | re.S, cache=False) export_symbol = KernRe(r'^\s*EXPORT_SYMBOL(_GPL)?\s*\(\s*(\w+)\s*\)\s*', cache=False) export_symbol_ns = KernRe(r'^\s*EXPORT_SYMBOL_NS(_GPL)?\s*\(\s*(\w+)\s*,\s*"\S+"\)\s*', cache=False) -type_param = KernRe(r"\@(\w*((\.\w+)|(->\w+))*(\.\.\.)?)", cache=False) +type_param = KernRe(r"@(\w*((\.\w+)|(->\w+))*(\.\.\.)?)", cache=False) # # Tests for the beginning of a kerneldoc block in its various forms. @@ -75,12 +73,154 @@ doc_begin_func = KernRe(str(doc_com) + # initial " * ' cache = False) # +# Here begins a long set of transformations to turn structure member prefixes +# and macro invocations into something we can parse and generate kdoc for. +# +struct_args_pattern = r'([^,)]+)' + +struct_xforms = [ + # Strip attributes + (KernRe(r"__attribute__\s*\(\([a-z0-9,_\*\s\(\)]*\)\)", flags=re.I | re.S, cache=False), ' '), + (KernRe(r'\s*__aligned\s*\([^;]*\)', re.S), ' '), + (KernRe(r'\s*__counted_by\s*\([^;]*\)', re.S), ' '), + (KernRe(r'\s*__counted_by_(le|be)\s*\([^;]*\)', re.S), ' '), + (KernRe(r'\s*__packed\s*', re.S), ' '), + (KernRe(r'\s*CRYPTO_MINALIGN_ATTR', re.S), ' '), + (KernRe(r'\s*____cacheline_aligned_in_smp', re.S), ' '), + (KernRe(r'\s*____cacheline_aligned', re.S), ' '), + (KernRe(r'\s*__cacheline_group_(begin|end)\([^\)]+\);'), ''), + # + # Unwrap struct_group macros based on this definition: + # __struct_group(TAG, NAME, ATTRS, MEMBERS...) + # which has variants like: struct_group(NAME, MEMBERS...) + # Only MEMBERS arguments require documentation. + # + # Parsing them happens on two steps: + # + # 1. drop struct group arguments that aren't at MEMBERS, + # storing them as STRUCT_GROUP(MEMBERS) + # + # 2. remove STRUCT_GROUP() ancillary macro. 
+ # + # The original logic used to remove STRUCT_GROUP() using an + # advanced regex: + # + # \bSTRUCT_GROUP(\(((?:(?>[^)(]+)|(?1))*)\))[^;]*; + # + # with two patterns that are incompatible with + # Python re module, as it has: + # + # - a recursive pattern: (?1) + # - an atomic grouping: (?>...) + # + # I tried a simpler version: but it didn't work either: + # \bSTRUCT_GROUP\(([^\)]+)\)[^;]*; + # + # As it doesn't properly match the end parenthesis on some cases. + # + # So, a better solution was crafted: there's now a NestedMatch + # class that ensures that delimiters after a search are properly + # matched. So, the implementation to drop STRUCT_GROUP() will be + # handled in separate. + # + (KernRe(r'\bstruct_group\s*\(([^,]*,)', re.S), r'STRUCT_GROUP('), + (KernRe(r'\bstruct_group_attr\s*\(([^,]*,){2}', re.S), r'STRUCT_GROUP('), + (KernRe(r'\bstruct_group_tagged\s*\(([^,]*),([^,]*),', re.S), r'struct \1 \2; STRUCT_GROUP('), + (KernRe(r'\b__struct_group\s*\(([^,]*,){3}', re.S), r'STRUCT_GROUP('), + # + # Replace macros + # + # TODO: use NestedMatch for FOO($1, $2, ...) matches + # + # it is better to also move those to the NestedMatch logic, + # to ensure that parenthesis will be properly matched. + # + (KernRe(r'__ETHTOOL_DECLARE_LINK_MODE_MASK\s*\(([^\)]+)\)', re.S), + r'DECLARE_BITMAP(\1, __ETHTOOL_LINK_MODE_MASK_NBITS)'), + (KernRe(r'DECLARE_PHY_INTERFACE_MASK\s*\(([^\)]+)\)', re.S), + r'DECLARE_BITMAP(\1, PHY_INTERFACE_MODE_MAX)'), + (KernRe(r'DECLARE_BITMAP\s*\(' + struct_args_pattern + r',\s*' + struct_args_pattern + r'\)', + re.S), r'unsigned long \1[BITS_TO_LONGS(\2)]'), + (KernRe(r'DECLARE_HASHTABLE\s*\(' + struct_args_pattern + r',\s*' + struct_args_pattern + r'\)', + re.S), r'unsigned long \1[1 << ((\2) - 1)]'), + (KernRe(r'DECLARE_KFIFO\s*\(' + struct_args_pattern + r',\s*' + struct_args_pattern + + r',\s*' + struct_args_pattern + r'\)', re.S), r'\2 *\1'), + (KernRe(r'DECLARE_KFIFO_PTR\s*\(' + struct_args_pattern + r',\s*' + + struct_args_pattern + r'\)', re.S), r'\2 *\1'), + (KernRe(r'(?:__)?DECLARE_FLEX_ARRAY\s*\(' + struct_args_pattern + r',\s*' + + struct_args_pattern + r'\)', re.S), r'\1 \2[]'), + (KernRe(r'DEFINE_DMA_UNMAP_ADDR\s*\(' + struct_args_pattern + r'\)', re.S), r'dma_addr_t \1'), + (KernRe(r'DEFINE_DMA_UNMAP_LEN\s*\(' + struct_args_pattern + r'\)', re.S), r'__u32 \1'), +] +# +# Regexes here are guaranteed to have the end limiter matching +# the start delimiter. Yet, right now, only one replace group +# is allowed. 
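To make the transforms above concrete, consider a hypothetical declaration as the parser would see it (example mine, not from the patch):

    struct foo_stats {
        DECLARE_BITMAP(active, 64);
        struct_group(counters,
            u64 rx_packets;
            u64 tx_packets;
        );
    };

The DECLARE_BITMAP() rule rewrites the first member to "unsigned long active[BITS_TO_LONGS(64)]", while the struct_group() rule drops the leading "counters," argument, leaving "STRUCT_GROUP(u64 rx_packets; u64 tx_packets;);" for the NestedMatch pass below to unwrap.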
+# +struct_nested_prefixes = [ + (re.compile(r'\bSTRUCT_GROUP\('), r'\1'), +] + +# +# Transforms for function prototypes +# +function_xforms = [ + (KernRe(r"^static +"), ""), + (KernRe(r"^extern +"), ""), + (KernRe(r"^asmlinkage +"), ""), + (KernRe(r"^inline +"), ""), + (KernRe(r"^__inline__ +"), ""), + (KernRe(r"^__inline +"), ""), + (KernRe(r"^__always_inline +"), ""), + (KernRe(r"^noinline +"), ""), + (KernRe(r"^__FORTIFY_INLINE +"), ""), + (KernRe(r"__init +"), ""), + (KernRe(r"__init_or_module +"), ""), + (KernRe(r"__deprecated +"), ""), + (KernRe(r"__flatten +"), ""), + (KernRe(r"__meminit +"), ""), + (KernRe(r"__must_check +"), ""), + (KernRe(r"__weak +"), ""), + (KernRe(r"__sched +"), ""), + (KernRe(r"_noprof"), ""), + (KernRe(r"__printf\s*\(\s*\d*\s*,\s*\d*\s*\) +"), ""), + (KernRe(r"__(?:re)?alloc_size\s*\(\s*\d+\s*(?:,\s*\d+\s*)?\) +"), ""), + (KernRe(r"__diagnose_as\s*\(\s*\S+\s*(?:,\s*\d+\s*)*\) +"), ""), + (KernRe(r"DECL_BUCKET_PARAMS\s*\(\s*(\S+)\s*,\s*(\S+)\s*\)"), r"\1, \2"), + (KernRe(r"__attribute_const__ +"), ""), + (KernRe(r"__attribute__\s*\(\((?:[\w\s]+(?:\([^)]*\))?\s*,?)+\)\)\s+"), ""), +] + +# +# Apply a set of transforms to a block of text. +# +def apply_transforms(xforms, text): + for search, subst in xforms: + text = search.sub(subst, text) + return text + +# # A little helper to get rid of excess white space # multi_space = KernRe(r'\s\s+') def trim_whitespace(s): return multi_space.sub(' ', s.strip()) +# +# Remove struct/enum members that have been marked "private". +# +def trim_private_members(text): + # + # First look for a "public:" block that ends a private region, then + # handle the "private until the end" case. + # + text = KernRe(r'/\*\s*private:.*?/\*\s*public:.*?\*/', flags=re.S).sub('', text) + text = KernRe(r'/\*\s*private:.*', flags=re.S).sub('', text) + # + # We needed the comments to do the above, but now we can take them out. + # + return KernRe(r'\s*/\*.*?\*/\s*', flags=re.S).sub('', text).strip() + class state: """ State machine enums @@ -318,36 +458,26 @@ class KernelDoc: param = KernRe(r'[\[\)].*').sub('', param, count=1) - if dtype == "" and param.endswith("..."): - if KernRe(r'\w\.\.\.$').search(param): - # For named variable parameters of the form `x...`, - # remove the dots - param = param[:-3] - else: - # Handles unnamed variable parameters - param = "..." - - if param not in self.entry.parameterdescs or \ - not self.entry.parameterdescs[param]: - - self.entry.parameterdescs[param] = "variable arguments" - - elif dtype == "" and (not param or param == "void"): - param = "void" - self.entry.parameterdescs[param] = "no arguments" - - elif dtype == "" and param in ["struct", "union"]: - # Handle unnamed (anonymous) union or struct - dtype = param - param = "{unnamed_" + param + "}" - self.entry.parameterdescs[param] = "anonymous\n" - self.entry.anon_struct_union = True - - # Handle cache group enforcing variables: they do not need - # to be described in header files - elif "__cacheline_group" in param: - # Ignore __cacheline_group_begin and __cacheline_group_end - return + # + # Look at various "anonymous type" cases. 
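As an example of what trim_private_members(), defined above, strips, take this hypothetical struct (mine, for illustration):

    struct foo {
        int visible;            /* documented */
        /* private: */
        spinlock_t lock;        /* dropped from the member list */
        /* public: */
        int also_visible;       /* documented again */
    };

Everything from a private: marker up to the next public: marker is removed, a trailing private: region with no matching public: is removed as well, and the remaining comments are then stripped.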
+ # + if dtype == '': + if param.endswith("..."): + if len(param) > 3: # there is a name provided, use that + param = param[:-3] + if not self.entry.parameterdescs.get(param): + self.entry.parameterdescs[param] = "variable arguments" + + elif (not param) or param == "void": + param = "void" + self.entry.parameterdescs[param] = "no arguments" + + elif param in ["struct", "union"]: + # Handle unnamed (anonymous) union or struct + dtype = param + param = "{unnamed_" + param + "}" + self.entry.parameterdescs[param] = "anonymous\n" + self.entry.anon_struct_union = True # Warn if parameter has no description # (but ignore ones starting with # as these are not parameters @@ -389,9 +519,6 @@ class KernelDoc: args = arg_expr.sub(r"\1#", args) for arg in args.split(splitter): - # Strip comments - arg = KernRe(r'\/\*.*\*\/').sub('', arg) - # Ignore argument attributes arg = KernRe(r'\sPOS0?\s').sub(' ', arg) @@ -407,81 +534,76 @@ class KernelDoc: # Treat preprocessor directive as a typeless variable self.push_parameter(ln, decl_type, arg, "", "", declaration_name) - + # + # The pointer-to-function case. + # elif KernRe(r'\(.+\)\s*\(').search(arg): - # Pointer-to-function - arg = arg.replace('#', ',') - - r = KernRe(r'[^\(]+\(\*?\s*([\w\[\]\.]*)\s*\)') + r = KernRe(r'[^\(]+\(\*?\s*' # Everything up to "(*" + r'([\w\[\].]*)' # Capture the name and possible [array] + r'\s*\)') # Make sure the trailing ")" is there if r.match(arg): param = r.group(1) else: self.emit_msg(ln, f"Invalid param: {arg}") param = arg - - dtype = KernRe(r'([^\(]+\(\*?)\s*' + re.escape(param)).sub(r'\1', arg) - self.push_parameter(ln, decl_type, param, dtype, - arg, declaration_name) - + dtype = arg.replace(param, '') + self.push_parameter(ln, decl_type, param, dtype, arg, declaration_name) + # + # The array-of-pointers case. Dig the parameter name out from the middle + # of the declaration. + # elif KernRe(r'\(.+\)\s*\[').search(arg): - # Array-of-pointers - - arg = arg.replace('#', ',') - r = KernRe(r'[^\(]+\(\s*\*\s*([\w\[\]\.]*?)\s*(\s*\[\s*[\w]+\s*\]\s*)*\)') + r = KernRe(r'[^\(]+\(\s*\*\s*' # Up to "(" and maybe "*" + r'([\w.]*?)' # The actual pointer name + r'\s*(\[\s*\w+\s*\]\s*)*\)') # The [array portion] if r.match(arg): param = r.group(1) else: self.emit_msg(ln, f"Invalid param: {arg}") param = arg - - dtype = KernRe(r'([^\(]+\(\*?)\s*' + re.escape(param)).sub(r'\1', arg) - - self.push_parameter(ln, decl_type, param, dtype, - arg, declaration_name) - + dtype = arg.replace(param, '') + self.push_parameter(ln, decl_type, param, dtype, arg, declaration_name) elif arg: + # + # Clean up extraneous spaces and split the string at commas; the first + # element of the resulting list will also include the type information. + # arg = KernRe(r'\s*:\s*').sub(":", arg) arg = KernRe(r'\s*\[').sub('[', arg) - args = KernRe(r'\s*,\s*').split(arg) - if args[0] and '*' in args[0]: - args[0] = re.sub(r'(\*+)\s*', r' \1', args[0]) - - first_arg = [] - r = KernRe(r'^(.*\s+)(.*?\[.*\].*)$') - if args[0] and r.match(args[0]): - args.pop(0) - first_arg.extend(r.group(1)) - first_arg.append(r.group(2)) + args[0] = re.sub(r'(\*+)\s*', r' \1', args[0]) + # + # args[0] has a string of "type a". If "a" includes an [array] + # declaration, we want to not be fooled by any white space inside + # the brackets, so detect and handle that case specially. 
+ # + r = KernRe(r'^([^[\]]*\s+)(.*)$') + if r.match(args[0]): + args[0] = r.group(2) + dtype = r.group(1) else: - first_arg = KernRe(r'\s+').split(args.pop(0)) - - args.insert(0, first_arg.pop()) - dtype = ' '.join(first_arg) + # No space in args[0]; this seems wrong but preserves previous behavior + dtype = '' + bitfield_re = KernRe(r'(.*?):(\w+)') for param in args: - if KernRe(r'^(\*+)\s*(.*)').match(param): - r = KernRe(r'^(\*+)\s*(.*)') - if not r.match(param): - self.emit_msg(ln, f"Invalid param: {param}") - continue - - param = r.group(1) - + # + # For pointers, shift the star(s) from the variable name to the + # type declaration. + # + r = KernRe(r'^(\*+)\s*(.*)') + if r.match(param): self.push_parameter(ln, decl_type, r.group(2), f"{dtype} {r.group(1)}", arg, declaration_name) - - elif KernRe(r'(.*?):(\w+)').search(param): - r = KernRe(r'(.*?):(\w+)') - if not r.match(param): - self.emit_msg(ln, f"Invalid param: {param}") - continue - + # + # Perform a similar shift for bitfields. + # + elif bitfield_re.search(param): if dtype != "": # Skip unnamed bit-fields - self.push_parameter(ln, decl_type, r.group(1), - f"{dtype}:{r.group(2)}", + self.push_parameter(ln, decl_type, bitfield_re.group(1), + f"{dtype}:{bitfield_re.group(2)}", arg, declaration_name) else: self.push_parameter(ln, decl_type, param, dtype, @@ -520,13 +642,11 @@ class KernelDoc: self.emit_msg(ln, f"No description found for return value of '{declaration_name}'") - def dump_struct(self, ln, proto): - """ - Store an entry for an struct or union - """ - + # + # Split apart a structure prototype; returns (struct|union, name, members) or None + # + def split_struct_proto(self, proto): type_pattern = r'(struct|union)' - qualifiers = [ "__attribute__", "__packed", @@ -534,288 +654,202 @@ class KernelDoc: "____cacheline_aligned_in_smp", "____cacheline_aligned", ] - definition_body = r'\{(.*)\}\s*' + "(?:" + '|'.join(qualifiers) + ")?" - struct_members = KernRe(type_pattern + r'([^\{\};]+)(\{)([^\{\}]*)(\})([^\{\}\;]*)(\;)') - - # Extract struct/union definition - members = None - declaration_name = None - decl_type = None r = KernRe(type_pattern + r'\s+(\w+)\s*' + definition_body) if r.search(proto): - decl_type = r.group(1) - declaration_name = r.group(2) - members = r.group(3) + return (r.group(1), r.group(2), r.group(3)) else: r = KernRe(r'typedef\s+' + type_pattern + r'\s*' + definition_body + r'\s*(\w+)\s*;') - if r.search(proto): - decl_type = r.group(1) - declaration_name = r.group(3) - members = r.group(2) - - if not members: - self.emit_msg(ln, f"{proto} error: Cannot parse struct or union!") - return - - if self.entry.identifier != declaration_name: - self.emit_msg(ln, - f"expecting prototype for {decl_type} {self.entry.identifier}. 
Prototype was for {decl_type} {declaration_name} instead\n") - return - - args_pattern = r'([^,)]+)' - - sub_prefixes = [ - (KernRe(r'\/\*\s*private:.*?\/\*\s*public:.*?\*\/', re.S | re.I), ''), - (KernRe(r'\/\*\s*private:.*', re.S | re.I), ''), - - # Strip comments - (KernRe(r'\/\*.*?\*\/', re.S), ''), - - # Strip attributes - (attribute, ' '), - (KernRe(r'\s*__aligned\s*\([^;]*\)', re.S), ' '), - (KernRe(r'\s*__counted_by\s*\([^;]*\)', re.S), ' '), - (KernRe(r'\s*__counted_by_(le|be)\s*\([^;]*\)', re.S), ' '), - (KernRe(r'\s*__packed\s*', re.S), ' '), - (KernRe(r'\s*CRYPTO_MINALIGN_ATTR', re.S), ' '), - (KernRe(r'\s*____cacheline_aligned_in_smp', re.S), ' '), - (KernRe(r'\s*____cacheline_aligned', re.S), ' '), - - # Unwrap struct_group macros based on this definition: - # __struct_group(TAG, NAME, ATTRS, MEMBERS...) - # which has variants like: struct_group(NAME, MEMBERS...) - # Only MEMBERS arguments require documentation. - # - # Parsing them happens on two steps: - # - # 1. drop struct group arguments that aren't at MEMBERS, - # storing them as STRUCT_GROUP(MEMBERS) - # - # 2. remove STRUCT_GROUP() ancillary macro. - # - # The original logic used to remove STRUCT_GROUP() using an - # advanced regex: - # - # \bSTRUCT_GROUP(\(((?:(?>[^)(]+)|(?1))*)\))[^;]*; - # - # with two patterns that are incompatible with - # Python re module, as it has: - # - # - a recursive pattern: (?1) - # - an atomic grouping: (?>...) - # - # I tried a simpler version: but it didn't work either: - # \bSTRUCT_GROUP\(([^\)]+)\)[^;]*; - # - # As it doesn't properly match the end parenthesis on some cases. - # - # So, a better solution was crafted: there's now a NestedMatch - # class that ensures that delimiters after a search are properly - # matched. So, the implementation to drop STRUCT_GROUP() will be - # handled in separate. - - (KernRe(r'\bstruct_group\s*\(([^,]*,)', re.S), r'STRUCT_GROUP('), - (KernRe(r'\bstruct_group_attr\s*\(([^,]*,){2}', re.S), r'STRUCT_GROUP('), - (KernRe(r'\bstruct_group_tagged\s*\(([^,]*),([^,]*),', re.S), r'struct \1 \2; STRUCT_GROUP('), - (KernRe(r'\b__struct_group\s*\(([^,]*,){3}', re.S), r'STRUCT_GROUP('), - - # Replace macros - # - # TODO: use NestedMatch for FOO($1, $2, ...) matches - # - # it is better to also move those to the NestedMatch logic, - # to ensure that parenthesis will be properly matched. 
- - (KernRe(r'__ETHTOOL_DECLARE_LINK_MODE_MASK\s*\(([^\)]+)\)', re.S), r'DECLARE_BITMAP(\1, __ETHTOOL_LINK_MODE_MASK_NBITS)'), - (KernRe(r'DECLARE_PHY_INTERFACE_MASK\s*\(([^\)]+)\)', re.S), r'DECLARE_BITMAP(\1, PHY_INTERFACE_MODE_MAX)'), - (KernRe(r'DECLARE_BITMAP\s*\(' + args_pattern + r',\s*' + args_pattern + r'\)', re.S), r'unsigned long \1[BITS_TO_LONGS(\2)]'), - (KernRe(r'DECLARE_HASHTABLE\s*\(' + args_pattern + r',\s*' + args_pattern + r'\)', re.S), r'unsigned long \1[1 << ((\2) - 1)]'), - (KernRe(r'DECLARE_KFIFO\s*\(' + args_pattern + r',\s*' + args_pattern + r',\s*' + args_pattern + r'\)', re.S), r'\2 *\1'), - (KernRe(r'DECLARE_KFIFO_PTR\s*\(' + args_pattern + r',\s*' + args_pattern + r'\)', re.S), r'\2 *\1'), - (KernRe(r'(?:__)?DECLARE_FLEX_ARRAY\s*\(' + args_pattern + r',\s*' + args_pattern + r'\)', re.S), r'\1 \2[]'), - (KernRe(r'DEFINE_DMA_UNMAP_ADDR\s*\(' + args_pattern + r'\)', re.S), r'dma_addr_t \1'), - (KernRe(r'DEFINE_DMA_UNMAP_LEN\s*\(' + args_pattern + r'\)', re.S), r'__u32 \1'), - (KernRe(r'VIRTIO_DECLARE_FEATURES\s*\(' + args_pattern + r'\)', re.S), r'u64 \1; u64 \1_array[VIRTIO_FEATURES_DWORDS]'), - ] - - # Regexes here are guaranteed to have the end limiter matching - # the start delimiter. Yet, right now, only one replace group - # is allowed. - - sub_nested_prefixes = [ - (re.compile(r'\bSTRUCT_GROUP\('), r'\1'), - ] - - for search, sub in sub_prefixes: - members = search.sub(sub, members) - - nested = NestedMatch() - - for search, sub in sub_nested_prefixes: - members = nested.sub(search, sub, members) - - # Keeps the original declaration as-is - declaration = members - - # Split nested struct/union elements - # - # This loop was simpler at the original kernel-doc perl version, as - # while ($members =~ m/$struct_members/) { ... } - # reads 'members' string on each interaction. - # - # Python behavior is different: it parses 'members' only once, - # creating a list of tuples from the first interaction. + return (r.group(1), r.group(3), r.group(2)) + return None + # + # Rewrite the members of a structure or union for easier formatting later on. + # Among other things, this function will turn a member like: + # + # struct { inner_members; } foo; + # + # into: + # + # struct foo; inner_members; + # + def rewrite_struct_members(self, members): # - # On other words, this won't get nested structs. + # Process struct/union members from the most deeply nested outward. The + # trick is in the ^{ below - it prevents a match of an outer struct/union + # until the inner one has been munged (removing the "{" in the process). # - # So, we need to have an extra loop on Python to override such - # re limitation. - - while True: - tuples = struct_members.findall(members) - if not tuples: - break - + struct_members = KernRe(r'(struct|union)' # 0: declaration type + r'([^\{\};]+)' # 1: possible name + r'(\{)' + r'([^\{\}]*)' # 3: Contents of declaration + r'(\})' + r'([^\{\};]*)(;)') # 5: Remaining stuff after declaration + tuples = struct_members.findall(members) + while tuples: for t in tuples: newmember = "" - maintype = t[0] - s_ids = t[5] - content = t[3] - - oldmember = "".join(t) - - for s_id in s_ids.split(','): + oldmember = "".join(t) # Reconstruct the original formatting + dtype, name, lbr, content, rbr, rest, semi = t + # + # Pass through each field name, normalizing the form and formatting. 
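As a concrete example of the rewrite (hypothetical member text, not from the patch), an anonymous inner struct such as:

    struct { int x; int y; } point;

is flattened into:

    struct point; int point.x; int point.y;

so that each inner field can later be matched against its @point.x-style documentation line.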
+ # + for s_id in rest.split(','): s_id = s_id.strip() - - newmember += f"{maintype} {s_id}; " + newmember += f"{dtype} {s_id}; " + # + # Remove bitfield/array/pointer info, getting the bare name. + # s_id = KernRe(r'[:\[].*').sub('', s_id) s_id = KernRe(r'^\s*\**(\S+)\s*').sub(r'\1', s_id) - + # + # Pass through the members of this inner structure/union. + # for arg in content.split(';'): arg = arg.strip() - - if not arg: - continue - - r = KernRe(r'^([^\(]+\(\*?\s*)([\w\.]*)(\s*\).*)') + # + # Look for (type)(*name)(args) - pointer to function + # + r = KernRe(r'^([^\(]+\(\*?\s*)([\w.]*)(\s*\).*)') if r.match(arg): + dtype, name, extra = r.group(1), r.group(2), r.group(3) # Pointer-to-function - dtype = r.group(1) - name = r.group(2) - extra = r.group(3) - - if not name: - continue - if not s_id: # Anonymous struct/union newmember += f"{dtype}{name}{extra}; " else: newmember += f"{dtype}{s_id}.{name}{extra}; " - + # + # Otherwise a non-function member. + # else: - arg = arg.strip() - # Handle bitmaps + # + # Remove bitmap and array portions and spaces around commas + # arg = KernRe(r':\s*\d+\s*').sub('', arg) - - # Handle arrays arg = KernRe(r'\[.*\]').sub('', arg) - - # Handle multiple IDs arg = KernRe(r'\s*,\s*').sub(',', arg) - + # + # Look for a normal decl - "type name[,name...]" + # r = KernRe(r'(.*)\s+([\S+,]+)') - if r.search(arg): - dtype = r.group(1) - names = r.group(2) + for name in r.group(2).split(','): + name = KernRe(r'^\s*\**(\S+)\s*').sub(r'\1', name) + if not s_id: + # Anonymous struct/union + newmember += f"{r.group(1)} {name}; " + else: + newmember += f"{r.group(1)} {s_id}.{name}; " else: newmember += f"{arg}; " - continue - - for name in names.split(','): - name = KernRe(r'^\s*\**(\S+)\s*').sub(r'\1', name).strip() - - if not name: - continue - - if not s_id: - # Anonymous struct/union - newmember += f"{dtype} {name}; " - else: - newmember += f"{dtype} {s_id}.{name}; " - + # + # At the end of the s_id loop, replace the original declaration with + # the munged version. + # members = members.replace(oldmember, newmember) + # + # End of the tuple loop - search again and see if there are outer members + # that now turn up. + # + tuples = struct_members.findall(members) + return members - # Ignore other nested elements, like enums - members = re.sub(r'(\{[^\{\}]*\})', '', members) - - self.create_parameter_list(ln, decl_type, members, ';', - declaration_name) - self.check_sections(ln, declaration_name, decl_type) - - # Adjust declaration for better display + # + # Format the struct declaration into a standard form for inclusion in the + # resulting docs. + # + def format_struct_decl(self, declaration): + # + # Insert newlines, get rid of extra spaces. + # declaration = KernRe(r'([\{;])').sub(r'\1\n', declaration) declaration = KernRe(r'\}\s+;').sub('};', declaration) - - # Better handle inlined enums - while True: - r = KernRe(r'(enum\s+\{[^\}]+),([^\n])') - if not r.search(declaration): - break - + # + # Format inline enums with each member on its own line. + # + r = KernRe(r'(enum\s+\{[^\}]+),([^\n])') + while r.search(declaration): declaration = r.sub(r'\1,\n\2', declaration) - + # + # Now go through and supply the right number of tabs + # for each line. 
def_args = declaration.split('\n') level = 1 declaration = "" for clause in def_args: + clause = KernRe(r'\s+').sub(' ', clause.strip(), count=1) + if clause: + if '}' in clause and level > 1: + level -= 1 + if not clause.startswith('#'): + declaration += "\t" * level + declaration += "\t" + clause + "\n" + if "{" in clause and "}" not in clause: + level += 1 + return declaration - clause = clause.strip() - clause = KernRe(r'\s+').sub(' ', clause, count=1) - - if not clause: - continue - - if '}' in clause and level > 1: - level -= 1 - if not KernRe(r'^\s*#').match(clause): - declaration += "\t" * level + def dump_struct(self, ln, proto): """ Store an entry for a struct or union """ + # + # Do the basic parse to get the pieces of the declaration. + # + struct_parts = self.split_struct_proto(proto) + if not struct_parts: + self.emit_msg(ln, f"{proto} error: Cannot parse struct or union!") + return + decl_type, declaration_name, members = struct_parts - declaration += "\t" + clause + "\n" - if "{" in clause and "}" not in clause: - level += 1 + if self.entry.identifier != declaration_name: + self.emit_msg(ln, f"expecting prototype for {decl_type} {self.entry.identifier}. " + f"Prototype was for {decl_type} {declaration_name} instead\n") + return + # + # Go through the list of members applying all of our transformations. + # + members = trim_private_members(members) + members = apply_transforms(struct_xforms, members) + nested = NestedMatch() + for search, sub in struct_nested_prefixes: + members = nested.sub(search, sub, members) + # + # Deal with embedded struct and union members, and drop enums entirely. + # + declaration = members + members = self.rewrite_struct_members(members) + members = re.sub(r'(\{[^\{\}]*\})', '', members) + # + # Output the result and we are done. + # + self.create_parameter_list(ln, decl_type, members, ';', + declaration_name) + self.check_sections(ln, declaration_name, decl_type) self.output_declaration(decl_type, declaration_name, - definition=declaration, + definition=self.format_struct_decl(declaration), purpose=self.entry.declaration_purpose) def dump_enum(self, ln, proto): """ Stores an enum inside self.entries array. """ - - # Ignore members marked private - proto = KernRe(r'\/\*\s*private:.*?\/\*\s*public:.*?\*\/', flags=re.S).sub('', proto) - proto = KernRe(r'\/\*\s*private:.*}', flags=re.S).sub('}', proto) - - # Strip comments - proto = KernRe(r'\/\*.*?\*\/', flags=re.S).sub('', proto) - - # Strip #define macros inside enums + # + # Strip preprocessor directives. Note that this depends on the + # trailing semicolon we added in process_proto_type(). + # proto = KernRe(r'#\s*((define|ifdef|if)\s+|endif)[^;]*;', flags=re.S).sub('', proto) - # # Parse out the name and members of the enum. Typedef form first. # r = KernRe(r'typedef\s+enum\s*\{(.*)\}\s*(\w*)\s*;') if r.search(proto): declaration_name = r.group(2) - members = r.group(1).rstrip() + members = trim_private_members(r.group(1)) # # Failing that, look for a straight enum # @@ -823,7 +857,7 @@ class KernelDoc: r = KernRe(r'enum\s+(\w*)\s*\{(.*)\}') if r.match(proto): declaration_name = r.group(1) - members = r.group(2).rstrip() + members = trim_private_members(r.group(2)) # # OK, this isn't going to work. # @@ -892,62 +926,31 @@ class KernelDoc: Stores a function or function macro inside self.entries array.
""" - func_macro = False + found = func_macro = False return_type = '' decl_type = 'function' - - # Prefixes that would be removed - sub_prefixes = [ - (r"^static +", "", 0), - (r"^extern +", "", 0), - (r"^asmlinkage +", "", 0), - (r"^inline +", "", 0), - (r"^__inline__ +", "", 0), - (r"^__inline +", "", 0), - (r"^__always_inline +", "", 0), - (r"^noinline +", "", 0), - (r"^__FORTIFY_INLINE +", "", 0), - (r"__init +", "", 0), - (r"__init_or_module +", "", 0), - (r"__deprecated +", "", 0), - (r"__flatten +", "", 0), - (r"__meminit +", "", 0), - (r"__must_check +", "", 0), - (r"__weak +", "", 0), - (r"__sched +", "", 0), - (r"_noprof", "", 0), - (r"__printf\s*\(\s*\d*\s*,\s*\d*\s*\) +", "", 0), - (r"__(?:re)?alloc_size\s*\(\s*\d+\s*(?:,\s*\d+\s*)?\) +", "", 0), - (r"__diagnose_as\s*\(\s*\S+\s*(?:,\s*\d+\s*)*\) +", "", 0), - (r"DECL_BUCKET_PARAMS\s*\(\s*(\S+)\s*,\s*(\S+)\s*\)", r"\1, \2", 0), - (r"__attribute_const__ +", "", 0), - - # It seems that Python support for re.X is broken: - # At least for me (Python 3.13), this didn't work -# (r""" -# __attribute__\s*\(\( -# (?: -# [\w\s]+ # attribute name -# (?:\([^)]*\))? # attribute arguments -# \s*,? # optional comma at the end -# )+ -# \)\)\s+ -# """, "", re.X), - - # So, remove whitespaces and comments from it - (r"__attribute__\s*\(\((?:[\w\s]+(?:\([^)]*\))?\s*,?)+\)\)\s+", "", 0), - ] - - for search, sub, flags in sub_prefixes: - prototype = KernRe(search, flags).sub(sub, prototype) - - # Macros are a special case, as they change the prototype format + # + # Apply the initial transformations. + # + prototype = apply_transforms(function_xforms, prototype) + # + # If we have a macro, remove the "#define" at the front. + # new_proto = KernRe(r"^#\s*define\s+").sub("", prototype) if new_proto != prototype: - is_define_proto = True prototype = new_proto - else: - is_define_proto = False + # + # Dispense with the simple "#define A B" case here; the key + # is the space after the name of the symbol being defined. + # NOTE that the seemingly misnamed "func_macro" indicates a + # macro *without* arguments. + # + r = KernRe(r'^(\w+)\s+') + if r.search(prototype): + return_type = '' + declaration_name = r.group(1) + func_macro = True + found = True # Yes, this truly is vile. We are looking for: # 1. Return type (may be nothing if we're looking at a macro) @@ -965,91 +968,73 @@ class KernelDoc: # - atomic_set (macro) # - pci_match_device, __copy_to_user (long return type) - name = r'[a-zA-Z0-9_~:]+' - prototype_end1 = r'[^\(]*' - prototype_end2 = r'[^\{]*' - prototype_end = fr'\(({prototype_end1}|{prototype_end2})\)' - - # Besides compiling, Perl qr{[\w\s]+} works as a non-capturing group. - # So, this needs to be mapped in Python with (?:...)? or (?:...)+ - + name = r'\w+' type1 = r'(?:[\w\s]+)?' type2 = r'(?:[\w\s]+\*+)+' - - found = False - - if is_define_proto: - r = KernRe(r'^()(' + name + r')\s+') - - if r.search(prototype): - return_type = '' - declaration_name = r.group(2) - func_macro = True - - found = True - + # + # Attempt to match first on (args) with no internal parentheses; this + # lets us easily filter out __acquires() and other post-args stuff. If + # that fails, just grab the rest of the line to the last closing + # parenthesis. + # + proto_args = r'\(([^\(]*|.*)\)' + # + # (Except for the simple macro case) attempt to split up the prototype + # in the various ways we understand. 
+ # if not found: patterns = [ - rf'^()({name})\s*{prototype_end}', - rf'^({type1})\s+({name})\s*{prototype_end}', - rf'^({type2})\s*({name})\s*{prototype_end}', + rf'^()({name})\s*{proto_args}', + rf'^({type1})\s+({name})\s*{proto_args}', + rf'^({type2})\s*({name})\s*{proto_args}', ] for p in patterns: r = KernRe(p) - if r.match(prototype): - return_type = r.group(1) declaration_name = r.group(2) args = r.group(3) - self.create_parameter_list(ln, decl_type, args, ',', declaration_name) - found = True break + # + # Parsing done; make sure that things are as we expect. + # if not found: self.emit_msg(ln, f"cannot understand function prototype: '{prototype}'") return - if self.entry.identifier != declaration_name: - self.emit_msg(ln, - f"expecting prototype for {self.entry.identifier}(). Prototype was for {declaration_name}() instead") + self.emit_msg(ln, f"expecting prototype for {self.entry.identifier}(). " + f"Prototype was for {declaration_name}() instead") return - self.check_sections(ln, declaration_name, "function") - self.check_return_section(ln, declaration_name, return_type) + # + # Store the result. + # + self.output_declaration(decl_type, declaration_name, + typedef=('typedef' in return_type), + functiontype=return_type, + purpose=self.entry.declaration_purpose, + func_macro=func_macro) - if 'typedef' in return_type: - self.output_declaration(decl_type, declaration_name, - typedef=True, - functiontype=return_type, - purpose=self.entry.declaration_purpose, - func_macro=func_macro) - else: - self.output_declaration(decl_type, declaration_name, - typedef=False, - functiontype=return_type, - purpose=self.entry.declaration_purpose, - func_macro=func_macro) def dump_typedef(self, ln, proto): """ Stores a typedef inside self.entries array. """ - - typedef_type = r'((?:\s+[\w\*]+\b){0,7}\s+(?:\w+\b|\*+))\s*' + # + # We start by looking for function typedefs. + # + typedef_type = r'typedef((?:\s+[\w*]+\b){0,7}\s+(?:\w+\b|\*+))\s*' typedef_ident = r'\*?\s*(\w\S+)\s*' typedef_args = r'\s*\((.*)\);' - typedef1 = KernRe(r'typedef' + typedef_type + r'\(' + typedef_ident + r'\)' + typedef_args) - typedef2 = KernRe(r'typedef' + typedef_type + typedef_ident + typedef_args) - - # Strip comments - proto = KernRe(r'/\*.*?\*/', flags=re.S).sub('', proto) + typedef1 = KernRe(typedef_type + r'\(' + typedef_ident + r'\)' + typedef_args) + typedef2 = KernRe(typedef_type + typedef_ident + typedef_args) # Parse function typedef prototypes for r in [typedef1, typedef2]: @@ -1065,21 +1050,16 @@ class KernelDoc: f"expecting prototype for typedef {self.entry.identifier}. Prototype was for typedef {declaration_name} instead\n") return - decl_type = 'function' - self.create_parameter_list(ln, decl_type, args, ',', declaration_name) + self.create_parameter_list(ln, 'function', args, ',', declaration_name) - self.output_declaration(decl_type, declaration_name, + self.output_declaration('function', declaration_name, typedef=True, functiontype=return_type, purpose=self.entry.declaration_purpose) return - - # Handle nested parentheses or brackets - r = KernRe(r'(\(*.\)\s*|\[*.\]\s*);$') - while r.search(proto): - proto = r.sub('', proto) - - # Parse simple typedefs + # + # Not a function, try to parse a simple typedef. 
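For reference, the two function-typedef shapes matched above (hypothetical examples):

    typedef int (*foo_cb_t)(struct foo *f, int arg); /* typedef1: (*name)(args) */
    typedef int foo_handler_t(struct foo *f);        /* typedef2: name(args) */

Anything matching neither form falls through to the simple-typedef regex handled next.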
+ # r = KernRe(r'typedef.*\s+(\w+)\s*;') if r.match(proto): declaration_name = r.group(1) @@ -1262,7 +1242,7 @@ class KernelDoc: self.dump_section() # Look for doc_com + <text> + doc_end: - r = KernRe(r'\s*\*\s*[a-zA-Z_0-9:\.]+\*/') + r = KernRe(r'\s*\*\s*[a-zA-Z_0-9:.]+\*/') if r.match(line): self.emit_msg(ln, f"suspicious ending line: {line}") @@ -1473,7 +1453,7 @@ class KernelDoc: """Ancillary routine to process a function prototype""" # strip C99-style comments to end of line - line = KernRe(r"\/\/.*$", re.S).sub('', line) + line = KernRe(r"//.*$", re.S).sub('', line) # # Soak up the line's worth of prototype text, stopping at { or ; if present. # diff --git a/scripts/selinux/install_policy.sh b/scripts/selinux/install_policy.sh index db40237e60ce..77368a73f111 100755 --- a/scripts/selinux/install_policy.sh +++ b/scripts/selinux/install_policy.sh @@ -74,7 +74,7 @@ cd /etc/selinux/dummy/contexts/files $SF -F file_contexts / mounts=`cat /proc/$$/mounts | \ - grep -E "ext[234]|jfs|xfs|reiserfs|jffs2|gfs2|btrfs|f2fs|ocfs2" | \ + grep -E "ext[234]|jfs|xfs|jffs2|gfs2|btrfs|f2fs|ocfs2" | \ awk '{ print $2 '}` $SF -F file_contexts $mounts diff --git a/scripts/sphinx-build-wrapper b/scripts/sphinx-build-wrapper new file mode 100755 index 000000000000..abe8c26ae137 --- /dev/null +++ b/scripts/sphinx-build-wrapper @@ -0,0 +1,719 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 +# Copyright (C) 2025 Mauro Carvalho Chehab <mchehab+huawei@kernel.org> +# +# pylint: disable=R0902, R0912, R0913, R0914, R0915, R0917, C0103 +# +# Converted from docs Makefile and parallel-wrapper.sh, both under +# GPLv2, copyrighted since 2008 by the following authors: +# +# Akira Yokosawa <akiyks@gmail.com> +# Arnd Bergmann <arnd@arndb.de> +# Breno Leitao <leitao@debian.org> +# Carlos Bilbao <carlos.bilbao@amd.com> +# Dave Young <dyoung@redhat.com> +# Donald Hunter <donald.hunter@gmail.com> +# Geert Uytterhoeven <geert+renesas@glider.be> +# Jani Nikula <jani.nikula@intel.com> +# Jan Stancek <jstancek@redhat.com> +# Jonathan Corbet <corbet@lwn.net> +# Joshua Clayton <stillcompiling@gmail.com> +# Kees Cook <keescook@chromium.org> +# Linus Torvalds <torvalds@linux-foundation.org> +# Magnus Damm <damm+renesas@opensource.se> +# Masahiro Yamada <masahiroy@kernel.org> +# Mauro Carvalho Chehab <mchehab+huawei@kernel.org> +# Maxim Cournoyer <maxim.cournoyer@gmail.com> +# Peter Foley <pefoley2@pefoley.com> +# Randy Dunlap <rdunlap@infradead.org> +# Rob Herring <robh@kernel.org> +# Shuah Khan <shuahkh@osg.samsung.com> +# Thorsten Blum <thorsten.blum@toblux.com> +# Tomas Winkler <tomas.winkler@intel.com> + + +""" +Sphinx build wrapper that handles Kernel-specific business rules: + +- it gets the Kernel build environment vars; +- it determines the best parallelism; +- it handles SPHINXDIRS + +This tool ensures that MIN_PYTHON_VERSION is satisfied. If the current +version is below that, it looks for a newer Python binary and, if found, +re-runs itself using it.
+""" + +import argparse +import locale +import os +import re +import shlex +import shutil +import subprocess +import sys + +from concurrent import futures +from glob import glob + +LIB_DIR = "lib" +SRC_DIR = os.path.dirname(os.path.realpath(__file__)) + +sys.path.insert(0, os.path.join(SRC_DIR, LIB_DIR)) + +from jobserver import JobserverExec # pylint: disable=C0413 + + +def parse_version(version): + """Convert a major.minor.patch version into a tuple""" + return tuple(int(x) for x in version.split(".")) + +def ver_str(version): + """Returns a version tuple as major.minor.patch""" + + return ".".join([str(x) for x in version]) + +# Minimal supported Python version needed by Sphinx and its extensions +MIN_PYTHON_VERSION = parse_version("3.7") + +# Default value for --venv parameter +VENV_DEFAULT = "sphinx_latest" + +# List of make targets and its corresponding builder and output directory +TARGETS = { + "cleandocs": { + "builder": "clean", + }, + "htmldocs": { + "builder": "html", + }, + "epubdocs": { + "builder": "epub", + "out_dir": "epub", + }, + "texinfodocs": { + "builder": "texinfo", + "out_dir": "texinfo", + }, + "infodocs": { + "builder": "texinfo", + "out_dir": "texinfo", + }, + "latexdocs": { + "builder": "latex", + "out_dir": "latex", + }, + "pdfdocs": { + "builder": "latex", + "out_dir": "latex", + }, + "xmldocs": { + "builder": "xml", + "out_dir": "xml", + }, + "linkcheckdocs": { + "builder": "linkcheck" + }, +} + +# Paper sizes. An empty value will pick the default +PAPER = ["", "a4", "letter"] + +class SphinxBuilder: + """ + Handles a sphinx-build target, adding needed arguments to build + with the Kernel. + """ + + def is_rust_enabled(self): + """Check if rust is enabled at .config""" + config_path = os.path.join(self.srctree, ".config") + if os.path.isfile(config_path): + with open(config_path, "r", encoding="utf-8") as f: + return "CONFIG_RUST=y" in f.read() + return False + + def get_path(self, path, abs_path=False): + """ + Ancillary routine to handle patches the right way, as shell does. + + It first expands "~" and "~user". Then, if patch is not absolute, + join self.srctree. Finally, if requested, convert to abspath. 
+ """ + + path = os.path.expanduser(path) + if not path.startswith("/"): + path = os.path.join(self.srctree, path) + + if abs_path: + return os.path.abspath(path) + + return path + + def __init__(self, venv=None, verbose=False, n_jobs=None, interactive=None): + """Initialize internal variables""" + self.venv = venv + self.verbose = None + + # Normal variables passed from Kernel's makefile + self.kernelversion = os.environ.get("KERNELVERSION", "unknown") + self.kernelrelease = os.environ.get("KERNELRELEASE", "unknown") + self.pdflatex = os.environ.get("PDFLATEX", "xelatex") + + if not interactive: + self.latexopts = os.environ.get("LATEXOPTS", "-interaction=batchmode -no-shell-escape") + else: + self.latexopts = os.environ.get("LATEXOPTS", "") + + if not verbose: + verbose = bool(os.environ.get("KBUILD_VERBOSE", "") != "") + + # Handle SPHINXOPTS evironment + sphinxopts = shlex.split(os.environ.get("SPHINXOPTS", "")) + + # As we handle number of jobs and quiet in separate, we need to pick + # it the same way as sphinx-build would pick, so let's use argparse + # do to the right argument expansion + parser = argparse.ArgumentParser() + parser.add_argument('-j', '--jobs', type=int) + parser.add_argument('-q', '--quiet', type=int) + + # Other sphinx-build arguments go as-is, so place them + # at self.sphinxopts + sphinx_args, self.sphinxopts = parser.parse_known_args(sphinxopts) + if sphinx_args.quiet == True: + self.verbose = False + + if sphinx_args.jobs: + self.n_jobs = sphinx_args.jobs + + # Command line arguments was passed, override SPHINXOPTS + if verbose is not None: + self.verbose = verbose + + self.n_jobs = n_jobs + + # Source tree directory. This needs to be at os.environ, as + # Sphinx extensions and media uAPI makefile needs it + self.srctree = os.environ.get("srctree") + if not self.srctree: + self.srctree = "." + os.environ["srctree"] = self.srctree + + # Now that we can expand srctree, get other directories as well + self.sphinxbuild = os.environ.get("SPHINXBUILD", "sphinx-build") + self.kerneldoc = self.get_path(os.environ.get("KERNELDOC", + "scripts/kernel-doc.py")) + self.obj = os.environ.get("obj", "Documentation") + self.builddir = self.get_path(os.path.join(self.obj, "output"), + abs_path=True) + + # Media uAPI needs it + os.environ["BUILDDIR"] = self.builddir + + # Detect if rust is enabled + self.config_rust = self.is_rust_enabled() + + # Get directory locations for LaTeX build toolchain + self.pdflatex_cmd = shutil.which(self.pdflatex) + self.latexmk_cmd = shutil.which("latexmk") + + self.env = os.environ.copy() + + # If venv parameter is specified, run Sphinx from venv + if venv: + bin_dir = os.path.join(venv, "bin") + if os.path.isfile(os.path.join(bin_dir, "activate")): + # "activate" virtual env + self.env["PATH"] = bin_dir + ":" + self.env["PATH"] + self.env["VIRTUAL_ENV"] = venv + if "PYTHONHOME" in self.env: + del self.env["PYTHONHOME"] + print(f"Setting venv to {venv}") + else: + sys.exit(f"Venv {venv} not found.") + + def run_sphinx(self, sphinx_build, build_args, *args, **pwargs): + """ + Executes sphinx-build using current python3 command and setting + -j parameter if possible to run the build in parallel. 
+ """ + + with JobserverExec() as jobserver: + if jobserver.claim: + n_jobs = str(jobserver.claim) + else: + n_jobs = "auto" # Supported since Sphinx 1.7 + + cmd = [] + + if self.venv: + cmd.append("python") + else: + cmd.append(sys.executable) + + cmd.append(sphinx_build) + + # if present, SPHINXOPTS or command line --jobs overrides default + if self.n_jobs: + n_jobs = str(self.n_jobs) + + if n_jobs: + cmd += [f"-j{n_jobs}"] + + if not self.verbose: + cmd.append("-q") + + cmd += self.sphinxopts + + cmd += build_args + + if self.verbose: + print(" ".join(cmd)) + + rc = subprocess.call(cmd, *args, **pwargs) + + def handle_html(self, css, output_dir): + """ + Extra steps for HTML and epub output. + + For such targets, we need to ensure that CSS will be properly + copied to the output _static directory + """ + + if not css: + return + + css = os.path.expanduser(css) + if not css.startswith("/"): + css = os.path.join(self.srctree, css) + + static_dir = os.path.join(output_dir, "_static") + os.makedirs(static_dir, exist_ok=True) + + try: + shutil.copy2(css, static_dir) + except (OSError, IOError) as e: + print(f"Warning: Failed to copy CSS: {e}", file=sys.stderr) + + def build_pdf_file(self, latex_cmd, from_dir, path): + """Builds a single pdf file using latex_cmd""" + try: + subprocess.run(latex_cmd + [path], + cwd=from_dir, check=True) + + return True + except subprocess.CalledProcessError: + # LaTeX PDF error code is almost useless: it returns + # error codes even when build succeeds but has warnings. + # So, we'll ignore the results + return False + + def pdf_parallel_build(self, tex_suffix, latex_cmd, tex_files, n_jobs): + """Build PDF files in parallel if possible""" + builds = {} + build_failed = False + max_len = 0 + has_tex = False + + # Process files in parallel + with futures.ThreadPoolExecutor(max_workers=n_jobs) as executor: + jobs = {} + + for from_dir, pdf_dir, entry in tex_files: + name = entry.name + + if not name.endswith(tex_suffix): + continue + + name = name[:-len(tex_suffix)] + + max_len = max(max_len, len(name)) + + has_tex = True + + future = executor.submit(self.build_pdf_file, latex_cmd, + from_dir, entry.path) + jobs[future] = (from_dir, name, entry.path) + + for future in futures.as_completed(jobs): + from_dir, name, path = jobs[future] + + pdf_name = name + ".pdf" + pdf_from = os.path.join(from_dir, pdf_name) + + try: + success = future.result() + + if success and os.path.exists(pdf_from): + pdf_to = os.path.join(pdf_dir, pdf_name) + + os.rename(pdf_from, pdf_to) + builds[name] = os.path.relpath(pdf_to, self.builddir) + else: + builds[name] = "FAILED" + build_failed = True + except Exception as e: + builds[name] = f"FAILED ({str(e)})" + build_failed = True + + # Handle case where no .tex files were found + if not has_tex: + name = "Sphinx LaTeX builder" + max_len = max(max_len, len(name)) + builds[name] = "FAILED (no .tex file was generated)" + build_failed = True + + return builds, build_failed, max_len + + def handle_pdf(self, output_dirs): + """ + Extra steps for PDF output. + + As PDF is handled via a LaTeX output, after building the .tex file, + a new build is needed to create the PDF output from the latex + directory. 
+ """ + builds = {} + max_len = 0 + tex_suffix = ".tex" + + # Get all tex files that will be used for PDF build + tex_files = [] + for from_dir in output_dirs: + pdf_dir = os.path.join(from_dir, "../pdf") + os.makedirs(pdf_dir, exist_ok=True) + + if self.latexmk_cmd: + latex_cmd = [self.latexmk_cmd, f"-{self.pdflatex}"] + else: + latex_cmd = [self.pdflatex] + + latex_cmd.extend(shlex.split(self.latexopts)) + + # Get a list of tex files to process + with os.scandir(from_dir) as it: + for entry in it: + if entry.name.endswith(tex_suffix): + tex_files.append((from_dir, pdf_dir, entry)) + + # When using make, this won't be used, as the number of jobs comes + # from POSIX jobserver. So, this covers the case where build comes + # from command line. On such case, serialize by default, except if + # the user explicitly sets the number of jobs. + n_jobs = 1 + + # n_jobs is either an integer or "auto". Only use it if it is a number + if self.n_jobs: + try: + n_jobs = int(self.n_jobs) + except ValueError: + pass + + # When using make, jobserver.claim is the number of jobs that were + # used with "-j" and that aren't used by other make targets + with JobserverExec() as jobserver: + n_jobs = 1 + + # Handle the case when a parameter is passed via command line, + # using it as default, if jobserver doesn't claim anything + if self.n_jobs: + try: + n_jobs = int(self.n_jobs) + except ValueError: + pass + + if jobserver.claim: + n_jobs = jobserver.claim + + # Build files in parallel + builds, build_failed, max_len = self.pdf_parallel_build(tex_suffix, + latex_cmd, + tex_files, + n_jobs) + + msg = "Summary" + msg += "\n" + "=" * len(msg) + print() + print(msg) + + for pdf_name, pdf_file in builds.items(): + print(f"{pdf_name:<{max_len}}: {pdf_file}") + + print() + + # return an error if a PDF file is missing + + if build_failed: + sys.exit(f"PDF build failed: not all PDF files were created.") + else: + print("All PDF files were built.") + + def handle_info(self, output_dirs): + """ + Extra steps for Info output. + + For texinfo generation, an additional make is needed from the + texinfo directory. + """ + + for output_dir in output_dirs: + try: + subprocess.run(["make", "info"], cwd=output_dir, check=True) + except subprocess.CalledProcessError as e: + sys.exit(f"Error generating info docs: {e}") + + def cleandocs(self, builder): + + shutil.rmtree(self.builddir, ignore_errors=True) + + def build(self, target, sphinxdirs=None, conf="conf.py", + theme=None, css=None, paper=None): + """ + Build documentation using Sphinx. This is the core function of this + module. It prepares all arguments required by sphinx-build. 
+ """ + + builder = TARGETS[target]["builder"] + out_dir = TARGETS[target].get("out_dir", "") + + # Cleandocs doesn't require sphinx-build + if target == "cleandocs": + self.cleandocs(builder) + return + + # Other targets require sphinx-build + sphinxbuild = shutil.which(self.sphinxbuild, path=self.env["PATH"]) + if not sphinxbuild: + sys.exit(f"Error: {self.sphinxbuild} not found in PATH.\n") + + if builder == "latex": + if not self.pdflatex_cmd and not self.latexmk_cmd: + sys.exit("Error: pdflatex or latexmk required for PDF generation") + + docs_dir = os.path.abspath(os.path.join(self.srctree, "Documentation")) + + # Prepare base arguments for Sphinx build + kerneldoc = self.kerneldoc + if kerneldoc.startswith(self.srctree): + kerneldoc = os.path.relpath(kerneldoc, self.srctree) + + # Prepare common Sphinx options + args = [ + "-b", builder, + "-c", docs_dir, + ] + + if builder == "latex": + if not paper: + paper = PAPER[1] + + args.extend(["-D", f"latex_elements.papersize={paper}paper"]) + + if self.config_rust: + args.extend(["-t", "rustdoc"]) + + if conf: + self.env["SPHINX_CONF"] = self.get_path(conf, abs_path=True) + + if not sphinxdirs: + sphinxdirs = os.environ.get("SPHINXDIRS", ".") + + # The sphinx-build tool has a bug: internally, it tries to set + # locale with locale.setlocale(locale.LC_ALL, ''). This causes a + # crash if language is not set. Detect and fix it. + try: + locale.setlocale(locale.LC_ALL, '') + except Exception: + self.env["LC_ALL"] = "C" + self.env["LANG"] = "C" + + # sphinxdirs can be a list or a whitespace-separated string + sphinxdirs_list = [] + for sphinxdir in sphinxdirs: + if isinstance(sphinxdir, list): + sphinxdirs_list += sphinxdir + else: + for name in sphinxdir.split(" "): + sphinxdirs_list.append(name) + + # Build each directory + output_dirs = [] + for sphinxdir in sphinxdirs_list: + src_dir = os.path.join(docs_dir, sphinxdir) + doctree_dir = os.path.join(self.builddir, ".doctrees") + output_dir = os.path.join(self.builddir, sphinxdir, out_dir) + + # Make directory names canonical + src_dir = os.path.normpath(src_dir) + doctree_dir = os.path.normpath(doctree_dir) + output_dir = os.path.normpath(output_dir) + + os.makedirs(doctree_dir, exist_ok=True) + os.makedirs(output_dir, exist_ok=True) + + output_dirs.append(output_dir) + + build_args = args + [ + "-d", doctree_dir, + "-D", f"kerneldoc_bin={kerneldoc}", + "-D", f"version={self.kernelversion}", + "-D", f"release={self.kernelrelease}", + "-D", f"kerneldoc_srctree={self.srctree}", + src_dir, + output_dir, + ] + + # Execute sphinx-build + try: + self.run_sphinx(sphinxbuild, build_args, env=self.env) + except Exception as e: + sys.exit(f"Build failed: {e}") + + # Ensure that html/epub will have needed static files + if target in ["htmldocs", "epubdocs"]: + self.handle_html(css, output_dir) + + # PDF and Info require a second build step + if target == "pdfdocs": + self.handle_pdf(output_dirs) + elif target == "infodocs": + self.handle_info(output_dirs) + + @staticmethod + def get_python_version(cmd): + """ + Get python version from a Python binary. As we need to detect if + are out there newer python binaries, we can't rely on sys.release here. 
+ """ + + result = subprocess.run([cmd, "--version"], check=True, + stdout=subprocess.PIPE, stderr=subprocess.PIPE, + universal_newlines=True) + version = result.stdout.strip() + + match = re.search(r"(\d+\.\d+\.\d+)", version) + if match: + return parse_version(match.group(1)) + + print(f"Can't parse version {version}") + return (0, 0, 0) + + @staticmethod + def find_python(): + """ + Detect if are out there any python 3.xy version newer than the + current one. + + Note: this routine is limited to up to 2 digits for python3. We + may need to update it one day, hopefully on a distant future. + """ + patterns = [ + "python3.[0-9]", + "python3.[0-9][0-9]", + ] + + # Seek for a python binary newer than MIN_PYTHON_VERSION + for path in os.getenv("PATH", "").split(":"): + for pattern in patterns: + for cmd in glob(os.path.join(path, pattern)): + if os.path.isfile(cmd) and os.access(cmd, os.X_OK): + version = SphinxBuilder.get_python_version(cmd) + if version >= MIN_PYTHON_VERSION: + return cmd + + return None + + @staticmethod + def check_python(): + """ + Check if the current python binary satisfies our minimal requirement + for Sphinx build. If not, re-run with a newer version if found. + """ + cur_ver = sys.version_info[:3] + if cur_ver >= MIN_PYTHON_VERSION: + return + + python_ver = ver_str(cur_ver) + + new_python_cmd = SphinxBuilder.find_python() + if not new_python_cmd: + sys.exit(f"Python version {python_ver} is not supported anymore.") + + # Restart script using the newer version + script_path = os.path.abspath(sys.argv[0]) + args = [new_python_cmd, script_path] + sys.argv[1:] + + print(f"Python {python_ver} not supported. Changing to {new_python_cmd}") + + try: + os.execv(new_python_cmd, args) + except OSError as e: + sys.exit(f"Failed to restart with {new_python_cmd}: {e}") + +def jobs_type(value): + """ + Handle valid values for -j. Accepts Sphinx "-jauto", plus a number + equal or bigger than one. + """ + if value is None: + return None + + if value.lower() == 'auto': + return value.lower() + + try: + if int(value) >= 1: + return value + + raise argparse.ArgumentTypeError(f"Minimum jobs is 1, got {value}") + except ValueError: + raise argparse.ArgumentTypeError(f"Must be 'auto' or positive integer, got {value}") + +def main(): + """ + Main function. The only mandatory argument is the target. If not + specified, the other arguments will use default values if not + specified at os.environ. 
+ """ + parser = argparse.ArgumentParser(description="Kernel documentation builder") + + parser.add_argument("target", choices=list(TARGETS.keys()), + help="Documentation target to build") + parser.add_argument("--sphinxdirs", nargs="+", + help="Specific directories to build") + parser.add_argument("--conf", default="conf.py", + help="Sphinx configuration file") + + parser.add_argument("--theme", help="Sphinx theme to use") + + parser.add_argument("--css", help="Custom CSS file for HTML/EPUB") + + parser.add_argument("--paper", choices=PAPER, default=PAPER[0], + help="Paper size for LaTeX/PDF output") + + parser.add_argument("-v", "--verbose", action='store_true', + help="place build in verbose mode") + + parser.add_argument('-j', '--jobs', type=jobs_type, + help="Sets number of jobs to use with sphinx-build") + + parser.add_argument('-i', '--interactive', action='store_true', + help="Change latex default to run in interactive mode") + + parser.add_argument("-V", "--venv", nargs='?', const=f'{VENV_DEFAULT}', + default=None, + help=f'If used, run Sphinx from a venv dir (default dir: {VENV_DEFAULT})') + + args = parser.parse_args() + + SphinxBuilder.check_python() + + builder = SphinxBuilder(venv=args.venv, verbose=args.verbose, + n_jobs=args.jobs, interactive=args.interactive) + + builder.build(args.target, sphinxdirs=args.sphinxdirs, conf=args.conf, + theme=args.theme, css=args.css, paper=args.paper) + +if __name__ == "__main__": + main() diff --git a/scripts/sphinx-pre-install b/scripts/sphinx-pre-install index 3f8d6925e896..954ed3dc0645 100755 --- a/scripts/sphinx-pre-install +++ b/scripts/sphinx-pre-install @@ -1,1056 +1,1621 @@ -#!/usr/bin/env perl +#!/usr/bin/env python3 # SPDX-License-Identifier: GPL-2.0-or-later -use strict; - -# Copyright (c) 2017-2020 Mauro Carvalho Chehab <mchehab@kernel.org> -# - -my $prefix = "./"; -$prefix = "$ENV{'srctree'}/" if ($ENV{'srctree'}); - -my $conf = $prefix . "Documentation/conf.py"; -my $requirement_file = $prefix . 
"Documentation/sphinx/requirements.txt"; -my $virtenv_prefix = "sphinx_"; - -# -# Static vars -# - -my %missing; -my $system_release; -my $need = 0; -my $optional = 0; -my $need_symlink = 0; -my $need_sphinx = 0; -my $need_pip = 0; -my $need_virtualenv = 0; -my $rec_sphinx_upgrade = 0; -my $verbose_warn_install = 1; -my $install = ""; -my $virtenv_dir = ""; -my $python_cmd = ""; -my $activate_cmd; -my $min_version; -my $cur_version; -my $rec_version = "3.4.3"; -my $latest_avail_ver; - -# -# Command line arguments -# - -my $pdf = 1; -my $virtualenv = 1; -my $version_check = 0; - -# -# List of required texlive packages on Fedora and OpenSuse -# - -my %texlive = ( - 'amsfonts.sty' => 'texlive-amsfonts', - 'amsmath.sty' => 'texlive-amsmath', - 'amssymb.sty' => 'texlive-amsfonts', - 'amsthm.sty' => 'texlive-amscls', - 'anyfontsize.sty' => 'texlive-anyfontsize', - 'atbegshi.sty' => 'texlive-oberdiek', - 'bm.sty' => 'texlive-tools', - 'capt-of.sty' => 'texlive-capt-of', - 'cmap.sty' => 'texlive-cmap', - 'ecrm1000.tfm' => 'texlive-ec', - 'eqparbox.sty' => 'texlive-eqparbox', - 'eu1enc.def' => 'texlive-euenc', - 'fancybox.sty' => 'texlive-fancybox', - 'fancyvrb.sty' => 'texlive-fancyvrb', - 'float.sty' => 'texlive-float', - 'fncychap.sty' => 'texlive-fncychap', - 'footnote.sty' => 'texlive-mdwtools', - 'framed.sty' => 'texlive-framed', - 'luatex85.sty' => 'texlive-luatex85', - 'multirow.sty' => 'texlive-multirow', - 'needspace.sty' => 'texlive-needspace', - 'palatino.sty' => 'texlive-psnfss', - 'parskip.sty' => 'texlive-parskip', - 'polyglossia.sty' => 'texlive-polyglossia', - 'tabulary.sty' => 'texlive-tabulary', - 'threeparttable.sty' => 'texlive-threeparttable', - 'titlesec.sty' => 'texlive-titlesec', - 'ucs.sty' => 'texlive-ucs', - 'upquote.sty' => 'texlive-upquote', - 'wrapfig.sty' => 'texlive-wrapfig', - 'ctexhook.sty' => 'texlive-ctex', -); - -# -# Subroutines that checks if a feature exists -# - -sub check_missing(%) -{ - my %map = %{$_[0]}; - - foreach my $prog (sort keys %missing) { - my $is_optional = $missing{$prog}; - - # At least on some LTS distros like CentOS 7, texlive doesn't - # provide all packages we need. When such distros are - # detected, we have to disable PDF output. - # - # So, we need to ignore the packages that distros would - # need for LaTeX to work - if ($is_optional == 2 && !$pdf) { - $optional--; - next; - } - - if ($verbose_warn_install) { - if ($is_optional) { - print "Warning: better to also install \"$prog\".\n"; - } else { - print "ERROR: please install \"$prog\", otherwise, build won't work.\n"; - } - } - if (defined($map{$prog})) { - $install .= " " . $map{$prog}; - } else { - $install .= " " . 
$prog; - } - } - - $install =~ s/^\s//; -} - -sub add_package($$) -{ - my $package = shift; - my $is_optional = shift; - - $missing{$package} = $is_optional; - if ($is_optional) { - $optional++; - } else { - $need++; - } -} - -sub check_missing_file($$$) -{ - my $files = shift; - my $package = shift; - my $is_optional = shift; - - for (@$files) { - return if(-e $_); - } - - add_package($package, $is_optional); -} - -sub findprog($) -{ - foreach(split(/:/, $ENV{PATH})) { - return "$_/$_[0]" if(-x "$_/$_[0]"); - } -} - -sub find_python_no_venv() -{ - my $prog = shift; - - my $cur_dir = qx(pwd); - $cur_dir =~ s/\s+$//; - - foreach my $dir (split(/:/, $ENV{PATH})) { - next if ($dir =~ m,($cur_dir)/sphinx,); - return "$dir/python3" if(-x "$dir/python3"); - } - foreach my $dir (split(/:/, $ENV{PATH})) { - next if ($dir =~ m,($cur_dir)/sphinx,); - return "$dir/python" if(-x "$dir/python"); - } - return "python"; -} - -sub check_program($$) -{ - my $prog = shift; - my $is_optional = shift; - - return $prog if findprog($prog); - - add_package($prog, $is_optional); -} - -sub check_perl_module($$) -{ - my $prog = shift; - my $is_optional = shift; - - my $err = system("perl -M$prog -e 1 2>/dev/null /dev/null"); - return if ($err == 0); - - add_package($prog, $is_optional); -} - -sub check_python_module($$) -{ - my $prog = shift; - my $is_optional = shift; - - return if (!$python_cmd); - - my $err = system("$python_cmd -c 'import $prog' 2>/dev/null /dev/null"); - return if ($err == 0); - - add_package($prog, $is_optional); -} - -sub check_rpm_missing($$) -{ - my @pkgs = @{$_[0]}; - my $is_optional = $_[1]; - - foreach my $prog(@pkgs) { - my $err = system("rpm -q '$prog' 2>/dev/null >/dev/null"); - add_package($prog, $is_optional) if ($err); - } -} - -sub check_pacman_missing($$) -{ - my @pkgs = @{$_[0]}; - my $is_optional = $_[1]; - - foreach my $prog(@pkgs) { - my $err = system("pacman -Q '$prog' 2>/dev/null >/dev/null"); - add_package($prog, $is_optional) if ($err); - } -} - -sub check_missing_tex($) -{ - my $is_optional = shift; - my $kpsewhich = findprog("kpsewhich"); - - foreach my $prog(keys %texlive) { - my $package = $texlive{$prog}; - if (!$kpsewhich) { - add_package($package, $is_optional); - next; - } - my $file = qx($kpsewhich $prog); - add_package($package, $is_optional) if ($file =~ /^\s*$/); - } -} - -sub get_sphinx_fname() -{ - if ($ENV{'SPHINXBUILD'}) { - return $ENV{'SPHINXBUILD'}; - } - - my $fname = "sphinx-build"; - return $fname if findprog($fname); - - $fname = "sphinx-build-3"; - if (findprog($fname)) { - $need_symlink = 1; - return $fname; - } - - return ""; -} - -sub get_sphinx_version($) -{ - my $cmd = shift; - my $ver; - - open IN, "$cmd --version 2>&1 |"; - while (<IN>) { - if (m/^\s*sphinx-build\s+([\d\.]+)((\+\/[\da-f]+)|(b\d+))?$/) { - $ver=$1; - last; - } - # Sphinx 1.2.x uses a different format - if (m/^\s*Sphinx.*\s+([\d\.]+)$/) { - $ver=$1; - last; - } - } - close IN; - return $ver; -} - -sub check_sphinx() -{ - open IN, $conf or die "Can't open $conf"; - while (<IN>) { - if (m/^\s*needs_sphinx\s*=\s*[\'\"]([\d\.]+)[\'\"]/) { - $min_version=$1; - last; - } - } - close IN; - - die "Can't get needs_sphinx version from $conf" if (!$min_version); - - $virtenv_dir = $virtenv_prefix . "latest"; - - my $sphinx = get_sphinx_fname(); - if ($sphinx eq "") { - $need_sphinx = 1; - return; - } - - $cur_version = get_sphinx_version($sphinx); - die "$sphinx didn't return its version" if (!$cur_version); - - if ($cur_version lt $min_version) { - printf "ERROR: Sphinx version is %s. 
It should be >= %s\n", - $cur_version, $min_version; - $need_sphinx = 1; - return; - } - - return if ($cur_version lt $rec_version); - - # On version check mode, just assume Sphinx has all mandatory deps - exit (0) if ($version_check); -} - -# -# Ancillary subroutines -# - -sub catcheck($) -{ - my $res = ""; - $res = qx(cat $_[0]) if (-r $_[0]); - return $res; -} - -sub which($) -{ - my $file = shift; - my @path = split ":", $ENV{PATH}; - - foreach my $dir(@path) { - my $name = $dir.'/'.$file; - return $name if (-x $name ); - } - return undef; -} - -# -# Subroutines that check distro-specific hints -# - -sub give_debian_hints() -{ - my %map = ( - "python-sphinx" => "python3-sphinx", - "yaml" => "python3-yaml", - "ensurepip" => "python3-venv", - "virtualenv" => "virtualenv", - "dot" => "graphviz", - "convert" => "imagemagick", - "Pod::Usage" => "perl-modules", - "xelatex" => "texlive-xetex", - "rsvg-convert" => "librsvg2-bin", - ); - - if ($pdf) { - check_missing_file(["/usr/share/texlive/texmf-dist/tex/latex/ctex/ctexhook.sty"], - "texlive-lang-chinese", 2); - - check_missing_file(["/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf"], - "fonts-dejavu", 2); - - check_missing_file(["/usr/share/fonts/noto-cjk/NotoSansCJK-Regular.ttc", - "/usr/share/fonts/opentype/noto/NotoSansCJK-Regular.ttc", - "/usr/share/fonts/opentype/noto/NotoSerifCJK-Regular.ttc"], - "fonts-noto-cjk", 2); - } - - check_program("dvipng", 2) if ($pdf); - check_missing(\%map); - - return if (!$need && !$optional); - printf("You should run:\n") if ($verbose_warn_install); - printf("\n\tsudo apt-get install $install\n"); -} - -sub give_redhat_hints() -{ - my %map = ( - "python-sphinx" => "python3-sphinx", - "yaml" => "python3-pyyaml", - "virtualenv" => "python3-virtualenv", - "dot" => "graphviz", - "convert" => "ImageMagick", - "Pod::Usage" => "perl-Pod-Usage", - "xelatex" => "texlive-xetex-bin", - "rsvg-convert" => "librsvg2-tools", - ); - - my @fedora26_opt_pkgs = ( - "graphviz-gd", # Fedora 26: needed for PDF support - ); - - my @fedora_tex_pkgs = ( - "texlive-collection-fontsrecommended", - "texlive-collection-latex", - "texlive-xecjk", - "dejavu-sans-fonts", - "dejavu-serif-fonts", - "dejavu-sans-mono-fonts", - ); - - # - # Checks valid for RHEL/CentOS version 7.x. - # - my $old = 0; - my $rel; - my $noto_sans_redhat = "google-noto-sans-cjk-ttc-fonts"; - $rel = $1 if ($system_release =~ /(release|Linux)\s+(\d+)/); - - if (!($system_release =~ /Fedora/)) { - $map{"virtualenv"} = "python-virtualenv"; - - if ($rel && $rel < 8) { - $old = 1; - $pdf = 0; - - printf("Note: texlive packages on RHEL/CENTOS <= 7 are incomplete. 
Can't support PDF output\n"); - printf("If you want to build PDF, please read:\n"); - printf("\thttps://www.systutorials.com/241660/how-to-install-tex-live-on-centos-7-linux/\n"); - } - } else { - if ($rel && $rel < 26) { - $old = 1; - } - if ($rel && $rel >= 38) { - $noto_sans_redhat = "google-noto-sans-cjk-fonts"; - } - } - if (!$rel) { - printf("Couldn't identify release number\n"); - $old = 1; - $pdf = 0; - } - - if ($pdf) { - check_missing_file(["/usr/share/fonts/google-noto-cjk/NotoSansCJK-Regular.ttc", - "/usr/share/fonts/google-noto-sans-cjk-fonts/NotoSansCJK-Regular.ttc"], - $noto_sans_redhat, 2); - } - - check_rpm_missing(\@fedora26_opt_pkgs, 2) if ($pdf && !$old); - check_rpm_missing(\@fedora_tex_pkgs, 2) if ($pdf); - check_missing_tex(2) if ($pdf); - check_missing(\%map); - - return if (!$need && !$optional); - - if (!$old) { - # dnf, for Fedora 18+ - printf("You should run:\n") if ($verbose_warn_install); - printf("\n\tsudo dnf install -y $install\n"); - } else { - # yum, for RHEL (and clones) or Fedora version < 18 - printf("You should run:\n") if ($verbose_warn_install); - printf("\n\tsudo yum install -y $install\n"); - } -} - -sub give_opensuse_hints() -{ - my %map = ( - "python-sphinx" => "python3-sphinx", - "yaml" => "python3-pyyaml", - "virtualenv" => "python3-virtualenv", - "dot" => "graphviz", - "convert" => "ImageMagick", - "Pod::Usage" => "perl-Pod-Usage", - "xelatex" => "texlive-xetex-bin", - ); - - # On Tumbleweed, this package is also named rsvg-convert - $map{"rsvg-convert"} = "rsvg-view" if (!($system_release =~ /Tumbleweed/)); - - my @suse_tex_pkgs = ( - "texlive-babel-english", - "texlive-caption", - "texlive-colortbl", - "texlive-courier", - "texlive-dvips", - "texlive-helvetic", - "texlive-makeindex", - "texlive-metafont", - "texlive-metapost", - "texlive-palatino", - "texlive-preview", - "texlive-times", - "texlive-zapfchan", - "texlive-zapfding", - ); - - $map{"latexmk"} = "texlive-latexmk-bin"; - - # FIXME: add support for installing CJK fonts - # - # I tried hard, but was unable to find a way to install - # "Noto Sans CJK SC" on openSUSE - - check_rpm_missing(\@suse_tex_pkgs, 2) if ($pdf); - check_missing_tex(2) if ($pdf); - check_missing(\%map); - - return if (!$need && !$optional); - printf("You should run:\n") if ($verbose_warn_install); - printf("\n\tsudo zypper install --no-recommends $install\n"); -} - -sub give_mageia_hints() -{ - my %map = ( - "python-sphinx" => "python3-sphinx", - "yaml" => "python3-yaml", - "virtualenv" => "python3-virtualenv", - "dot" => "graphviz", - "convert" => "ImageMagick", - "Pod::Usage" => "perl-Pod-Usage", - "xelatex" => "texlive", - "rsvg-convert" => "librsvg2", - ); - - my @tex_pkgs = ( - "texlive-fontsextra", - ); - - $map{"latexmk"} = "texlive-collection-basic"; - - my $packager_cmd; - my $noto_sans; - if ($system_release =~ /OpenMandriva/) { - $packager_cmd = "dnf install"; - $noto_sans = "noto-sans-cjk-fonts"; - @tex_pkgs = ( "texlive-collection-fontsextra" ); - } else { - $packager_cmd = "urpmi"; - $noto_sans = "google-noto-sans-cjk-ttc-fonts"; - } - - - if ($pdf) { - check_missing_file(["/usr/share/fonts/google-noto-cjk/NotoSansCJK-Regular.ttc", - "/usr/share/fonts/TTF/NotoSans-Regular.ttf"], - $noto_sans, 2); - } - - check_rpm_missing(\@tex_pkgs, 2) if ($pdf); - check_missing(\%map); - - return if (!$need && !$optional); - printf("You should run:\n") if ($verbose_warn_install); - printf("\n\tsudo $packager_cmd $install\n"); -} - -sub give_arch_linux_hints() -{ - my %map = ( - "yaml" => "python-yaml", - 
"virtualenv" => "python-virtualenv", - "dot" => "graphviz", - "convert" => "imagemagick", - "xelatex" => "texlive-xetex", - "latexmk" => "texlive-core", - "rsvg-convert" => "extra/librsvg", - ); - - my @archlinux_tex_pkgs = ( - "texlive-core", - "texlive-latexextra", - "ttf-dejavu", - ); - check_pacman_missing(\@archlinux_tex_pkgs, 2) if ($pdf); - - if ($pdf) { - check_missing_file(["/usr/share/fonts/noto-cjk/NotoSansCJK-Regular.ttc"], - "noto-fonts-cjk", 2); - } - - check_missing(\%map); - - return if (!$need && !$optional); - printf("You should run:\n") if ($verbose_warn_install); - printf("\n\tsudo pacman -S $install\n"); -} - -sub give_gentoo_hints() -{ - my %map = ( - "yaml" => "dev-python/pyyaml", - "virtualenv" => "dev-python/virtualenv", - "dot" => "media-gfx/graphviz", - "convert" => "media-gfx/imagemagick", - "xelatex" => "dev-texlive/texlive-xetex media-fonts/dejavu", - "rsvg-convert" => "gnome-base/librsvg", - ); - - check_missing_file(["/usr/share/fonts/dejavu/DejaVuSans.ttf"], - "media-fonts/dejavu", 2) if ($pdf); - - if ($pdf) { - check_missing_file(["/usr/share/fonts/noto-cjk/NotoSansCJKsc-Regular.otf", - "/usr/share/fonts/noto-cjk/NotoSerifCJK-Regular.ttc"], - "media-fonts/noto-cjk", 2); - } - - check_missing(\%map); - - return if (!$need && !$optional); - - printf("You should run:\n") if ($verbose_warn_install); - printf("\n"); - - my $imagemagick = "media-gfx/imagemagick svg png"; - my $cairo = "media-gfx/graphviz cairo pdf"; - my $portage_imagemagick = "/etc/portage/package.use/imagemagick"; - my $portage_cairo = "/etc/portage/package.use/graphviz"; - - if (qx(grep imagemagick $portage_imagemagick 2>/dev/null) eq "") { - printf("\tsudo su -c 'echo \"$imagemagick\" > $portage_imagemagick'\n") - } - if (qx(grep graphviz $portage_cairo 2>/dev/null) eq "") { - printf("\tsudo su -c 'echo \"$cairo\" > $portage_cairo'\n"); - } - - printf("\tsudo emerge --ask $install\n"); - -} - -sub check_distros() -{ - # Distro-specific hints - if ($system_release =~ /Red Hat Enterprise Linux/) { - give_redhat_hints; - return; - } - if ($system_release =~ /CentOS/) { - give_redhat_hints; - return; - } - if ($system_release =~ /Scientific Linux/) { - give_redhat_hints; - return; - } - if ($system_release =~ /Oracle Linux Server/) { - give_redhat_hints; - return; - } - if ($system_release =~ /Fedora/) { - give_redhat_hints; - return; - } - if ($system_release =~ /Ubuntu/) { - give_debian_hints; - return; - } - if ($system_release =~ /Debian/) { - give_debian_hints; - return; - } - if ($system_release =~ /openSUSE/) { - give_opensuse_hints; - return; - } - if ($system_release =~ /Mageia/) { - give_mageia_hints; - return; - } - if ($system_release =~ /OpenMandriva/) { - give_mageia_hints; - return; - } - if ($system_release =~ /Arch Linux/) { - give_arch_linux_hints; - return; - } - if ($system_release =~ /Gentoo/) { - give_gentoo_hints; - return; - } - - # - # Fall-back to generic hint code for other distros - # That's far from ideal, specially for LaTeX dependencies. 
- # - my %map = ( - "sphinx-build" => "sphinx" - ); - check_missing_tex(2) if ($pdf); - check_missing(\%map); - print "I don't know distro $system_release.\n"; - print "So, I can't provide you a hint with the install procedure.\n"; - print "There are likely missing dependencies.\n"; -} - -# -# Common dependencies +# Copyright (c) 2017-2025 Mauro Carvalho Chehab <mchehab+huawei@kernel.org> # +# pylint: disable=C0103,C0114,C0115,C0116,C0301,C0302 +# pylint: disable=R0902,R0904,R0911,R0912,R0914,R0915,R1705,R1710,E1121 + +# Note: this script requires at least Python 3.6 to run. +# Don't add changes that aren't compatible with it, as it is meant to +# report incompatible python versions. + +""" +Dependency checker for the Sphinx documentation Kernel build. + +This module provides tools to check for all required dependencies needed to +build documentation using Sphinx, including system packages, Python modules +and LaTeX packages for PDF generation. + +It detects packages for a subset of Linux distributions used by Kernel +maintainers, showing hints and missing dependencies. + +The main class SphinxDependencyChecker handles the dependency checking logic +and provides recommendations for installing missing packages. It supports both +system package installations and Python virtual environments. By default, +system package install is recommended. +""" + +import argparse +import os +import re +import subprocess +import sys +from glob import glob + + +def parse_version(version): + """Convert a major.minor.patch version into a tuple""" + return tuple(int(x) for x in version.split(".")) + + +def ver_str(version): + """Returns a version tuple as major.minor.patch""" + + return ".".join([str(x) for x in version]) + + +RECOMMENDED_VERSION = parse_version("3.4.3") +MIN_PYTHON_VERSION = parse_version("3.7")
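The two helpers above exist so that version comparisons are numeric rather than lexical (plain string comparison would rank "3.4.3" above "3.10.0"). For illustration:

    assert parse_version("3.4.3") == (3, 4, 3)
    assert ver_str((3, 4, 3)) == "3.4.3"
    # tuples compare element by element, so 3.10 correctly beats 3.4:
    assert parse_version("3.10.0") > parse_version("3.4.3")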
+ """ + self.missing = {} + self.missing_pkg = {} + self.need = 0 + self.optional = 0 + self.pdf = pdf + + @staticmethod + def name(dtype): + """ + Ancillary routine to output a warn/error message reporting + missing dependencies. + """ + if dtype[0] == DepManager._SYS_TYPE: + msg = "build" + elif dtype[0] == DepManager._PHY_TYPE: + msg = "Python" + else: + msg = "PDF" + + if dtype[1]: + return f"ERROR: {msg} mandatory deps missing" + else: + return f"Warning: {msg} optional deps missing" + + @staticmethod + def is_optional(dtype): + """Ancillary routine to report if a dependency is optional""" + return not dtype[1] + + @staticmethod + def is_pdf(dtype): + """Ancillary routine to report if a dependency is for PDF generation""" + if dtype[0] == DepManager._PDF_TYPE: + return True + + return False + + def add_package(self, package, dtype): + """ + Add a package at the self.missing() dictionary. + Doesn't update missing_pkg. + """ + is_optional = DepManager.is_optional(dtype) + self.missing[package] = dtype + if is_optional: + self.optional += 1 + else: + self.need += 1 + + def del_package(self, package): + """ + Remove a package at the self.missing() dictionary. + Doesn't update missing_pkg. + """ + if package in self.missing: + del self.missing[package] + + def clear_deps(self): + """ + Clear dependencies without changing needed/optional. + + This is an ackward way to have a separate section to recommend + a package after system main dependencies. + + TODO: rework the logic to prevent needing it. + """ + + self.missing = {} + self.missing_pkg = {} + + def check_missing(self, progs): + """ + Update self.missing_pkg, using progs dict to convert from the + agnostic package name to distro-specific one. + + Returns an string with the packages to be installed, sorted and + with eventual duplicates removed. + """ + + self.missing_pkg = {} + + for prog, dtype in sorted(self.missing.items()): + # At least on some LTS distros like CentOS 7, texlive doesn't + # provide all packages we need. When such distros are + # detected, we have to disable PDF output. + # + # So, we need to ignore the packages that distros would + # need for LaTeX to work + if DepManager.is_pdf(dtype) and not self.pdf: + self.optional -= 1 + continue + + if not dtype in self.missing_pkg: + self.missing_pkg[dtype] = [] + + self.missing_pkg[dtype].append(progs.get(prog, prog)) + + install = [] + for dtype, pkgs in self.missing_pkg.items(): + install += pkgs + + return " ".join(sorted(set(install))) + + def warn_install(self): + """ + Emit warnings/errors related to missing packages. + """ + + output_msg = "" + + for dtype in sorted(self.missing_pkg.keys()): + progs = " ".join(sorted(set(self.missing_pkg[dtype]))) + + try: + name = DepManager.name(dtype) + output_msg += f'{name}:\t{progs}\n' + except KeyError: + raise KeyError(f"ERROR!!!: invalid dtype for {progs}: {dtype}") + + if output_msg: + print(f"\n{output_msg}") + +class AncillaryMethods: + """ + Ancillary methods that checks for missing dependencies for different + types of types, like binaries, python modules, rpm deps, etc. + """ + + @staticmethod + def which(prog): + """ + Our own implementation of which(). We could instead use + shutil.which(), but this function is simple enough. + Probably faster to use this implementation than to import shutil. 
+ """ + for path in os.environ.get("PATH", "").split(":"): + full_path = os.path.join(path, prog) + if os.access(full_path, os.X_OK): + return full_path + + return None + + @staticmethod + def get_python_version(cmd): + """ + Get python version from a Python binary. As we need to detect if + are out there newer python binaries, we can't rely on sys.release here. + """ + + result = SphinxDependencyChecker.run([cmd, "--version"], + capture_output=True, text=True) + version = result.stdout.strip() + + match = re.search(r"(\d+\.\d+\.\d+)", version) + if match: + return parse_version(match.group(1)) + + print(f"Can't parse version {version}") + return (0, 0, 0) + + @staticmethod + def find_python(): + """ + Detect if are out there any python 3.xy version newer than the + current one. + + Note: this routine is limited to up to 2 digits for python3. We + may need to update it one day, hopefully on a distant future. + """ + patterns = [ + "python3.[0-9]", + "python3.[0-9][0-9]", + ] + + # Seek for a python binary newer than MIN_PYTHON_VERSION + for path in os.getenv("PATH", "").split(":"): + for pattern in patterns: + for cmd in glob(os.path.join(path, pattern)): + if os.path.isfile(cmd) and os.access(cmd, os.X_OK): + version = SphinxDependencyChecker.get_python_version(cmd) + if version >= MIN_PYTHON_VERSION: + return cmd + + @staticmethod + def check_python(): + """ + Check if the current python binary satisfies our minimal requirement + for Sphinx build. If not, re-run with a newer version if found. + """ + cur_ver = sys.version_info[:3] + if cur_ver >= MIN_PYTHON_VERSION: + ver = ver_str(cur_ver) + print(f"Python version: {ver}") + + # This could be useful for debugging purposes + if SphinxDependencyChecker.which("docutils"): + result = SphinxDependencyChecker.run(["docutils", "--version"], + capture_output=True, text=True) + ver = result.stdout.strip() + match = re.search(r"(\d+\.\d+\.\d+)", ver) + if match: + ver = match.group(1) + + print(f"Docutils version: {ver}") + + return + + python_ver = ver_str(cur_ver) + + new_python_cmd = SphinxDependencyChecker.find_python() + if not new_python_cmd: + print(f"ERROR: Python version {python_ver} is not spported anymore\n") + print(" Can't find a new version. This script may fail") + return + + # Restart script using the newer version + script_path = os.path.abspath(sys.argv[0]) + args = [new_python_cmd, script_path] + sys.argv[1:] + + print(f"Python {python_ver} not supported. Changing to {new_python_cmd}") + + try: + os.execv(new_python_cmd, args) + except OSError as e: + sys.exit(f"Failed to restart with {new_python_cmd}: {e}") + + @staticmethod + def run(*args, **kwargs): + """ + Excecute a command, hiding its output by default. + Preserve comatibility with older Python versions. + """ + + capture_output = kwargs.pop('capture_output', False) + + if capture_output: + if 'stdout' not in kwargs: + kwargs['stdout'] = subprocess.PIPE + if 'stderr' not in kwargs: + kwargs['stderr'] = subprocess.PIPE + else: + if 'stdout' not in kwargs: + kwargs['stdout'] = subprocess.DEVNULL + if 'stderr' not in kwargs: + kwargs['stderr'] = subprocess.DEVNULL + + # Don't break with older Python versions + if 'text' in kwargs and sys.version_info < (3, 7): + kwargs['universal_newlines'] = kwargs.pop('text') + + return subprocess.run(*args, **kwargs) + +class MissingCheckers(AncillaryMethods): + """ + Contains some ancillary checkers for different types of binaries and + package managers. 
+ """ + + def __init__(self, args, texlive): + """ + Initialize its internal variables + """ + self.pdf = args.pdf + self.virtualenv = args.virtualenv + self.version_check = args.version_check + self.texlive = texlive + + self.min_version = (0, 0, 0) + self.cur_version = (0, 0, 0) + + self.deps = DepManager(self.pdf) + + self.need_symlink = 0 + self.need_sphinx = 0 + + self.verbose_warn_install = 1 + + self.virtenv_dir = "" + self.install = "" + self.python_cmd = "" + + self.virtenv_prefix = ["sphinx_", "Sphinx_" ] + + def check_missing_file(self, files, package, dtype): + """ + Does the file exists? If not, add it to missing dependencies. + """ + for f in files: + if os.path.exists(f): + return + self.deps.add_package(package, dtype) + + def check_program(self, prog, dtype): + """ + Does the program exists and it is at the PATH? + If not, add it to missing dependencies. + """ + found = self.which(prog) + if found: + return found + + self.deps.add_package(prog, dtype) + + return None + + def check_perl_module(self, prog, dtype): + """ + Does perl have a dependency? Is it available? + If not, add it to missing dependencies. + + Right now, we still need Perl for doc build, as it is required + by some tools called at docs or kernel build time, like: + + scripts/documentation-file-ref-check + + Also, checkpatch is on Perl. + """ + + # While testing with lxc download template, one of the + # distros (Oracle) didn't have perl - nor even an option to install + # before installing oraclelinux-release-el9 package. + # + # Check it before running an error. If perl is not there, + # add it as a mandatory package, as some parts of the doc builder + # needs it. + if not self.which("perl"): + self.deps.add_package("perl", DepManager.SYSTEM_MANDATORY) + self.deps.add_package(prog, dtype) + return + + try: + self.run(["perl", f"-M{prog}", "-e", "1"], check=True) + except subprocess.CalledProcessError: + self.deps.add_package(prog, dtype) + + def check_python_module(self, module, is_optional=False): + """ + Does a python module exists outside venv? If not, add it to missing + dependencies. + """ + if is_optional: + dtype = DepManager.PYTHON_OPTIONAL + else: + dtype = DepManager.PYTHON_MANDATORY + + try: + self.run([self.python_cmd, "-c", f"import {module}"], check=True) + except subprocess.CalledProcessError: + self.deps.add_package(module, dtype) + + def check_rpm_missing(self, pkgs, dtype): + """ + Does a rpm package exists? If not, add it to missing dependencies. + """ + for prog in pkgs: + try: + self.run(["rpm", "-q", prog], check=True) + except subprocess.CalledProcessError: + self.deps.add_package(prog, dtype) + + def check_pacman_missing(self, pkgs, dtype): + """ + Does a pacman package exists? If not, add it to missing dependencies. + """ + for prog in pkgs: + try: + self.run(["pacman", "-Q", prog], check=True) + except subprocess.CalledProcessError: + self.deps.add_package(prog, dtype) + + def check_missing_tex(self, is_optional=False): + """ + Does a LaTeX package exists? If not, add it to missing dependencies. + """ + if is_optional: + dtype = DepManager.PDF_OPTIONAL + else: + dtype = DepManager.PDF_MANDATORY + + kpsewhich = self.which("kpsewhich") + for prog, package in self.texlive.items(): + + # If kpsewhich is not there, just add it to deps + if not kpsewhich: + self.deps.add_package(package, dtype) + continue + + # Check if the package is needed + try: + result = self.run( + [kpsewhich, prog], stdout=subprocess.PIPE, text=True, check=True + ) + + # Didn't find. 
+ def check_missing_tex(self, is_optional=False): + """ + Does a LaTeX package exist? If not, add it to the missing + dependencies. + """ + if is_optional: + dtype = DepManager.PDF_OPTIONAL + else: + dtype = DepManager.PDF_MANDATORY + + kpsewhich = self.which("kpsewhich") + for prog, package in self.texlive.items(): + + # If kpsewhich is not there, just add it to deps + if not kpsewhich: + self.deps.add_package(package, dtype) + continue + + # Check if the package is needed + try: + result = self.run( + [kpsewhich, prog], stdout=subprocess.PIPE, text=True, check=True + ) + + # Not found: add it + if not result.stdout.strip(): + self.deps.add_package(package, dtype) + + except subprocess.CalledProcessError: + # kpsewhich returned an error. Add it, just in case + self.deps.add_package(package, dtype) + + def get_sphinx_fname(self): + """ + Gets the binary filename for sphinx-build. + """ + if "SPHINXBUILD" in os.environ: + return os.environ["SPHINXBUILD"] + + fname = "sphinx-build" + if self.which(fname): + return fname + + fname = "sphinx-build-3" + if self.which(fname): + self.need_symlink = 1 + return fname + + return "" + + def get_sphinx_version(self, cmd): + """ + Gets the sphinx-build version. + """ + try: + result = self.run([cmd, "--version"], + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, check=True) + except (subprocess.CalledProcessError, FileNotFoundError): + return None + + for line in result.stdout.split("\n"): + match = re.match(r"^sphinx-build\s+([\d\.]+)(?:\+(?:/[\da-f]+)|b\d+)?\s*$", line) + if match: + return parse_version(match.group(1)) + + match = re.match(r"^Sphinx.*\s+([\d\.]+)\s*$", line) + if match: + return parse_version(match.group(1)) + + def check_sphinx(self, conf): + """ + Checks the Sphinx minimal requirements + """ + try: + with open(conf, "r", encoding="utf-8") as f: + for line in f: + match = re.match(r"^\s*needs_sphinx\s*=\s*[\'\"]([\d\.]+)[\'\"]", line) + if match: + self.min_version = parse_version(match.group(1)) + break + except IOError: + sys.exit(f"Can't open {conf}") + + if self.min_version == (0, 0, 0): + sys.exit(f"Can't get needs_sphinx version from {conf}") + + self.virtenv_dir = self.virtenv_prefix[0] + "latest" + + sphinx = self.get_sphinx_fname() + if not sphinx: + self.need_sphinx = 1 + return + + self.cur_version = self.get_sphinx_version(sphinx) + if not self.cur_version: + sys.exit(f"{sphinx} didn't return its version") + + if self.cur_version < self.min_version: + curver = ver_str(self.cur_version) + minver = ver_str(self.min_version) + + print(f"ERROR: Sphinx version is {curver}. It should be >= {minver}") + self.need_sphinx = 1 + return + + # On version check mode, just assume Sphinx has all mandatory deps + if self.version_check and self.cur_version >= RECOMMENDED_VERSION: + sys.exit(0) + + def catcheck(self, filename): + """ + Reads a file if it exists, returning it as a string. + If not found, returns an empty string. + """ + if os.path.exists(filename): + with open(filename, "r", encoding="utf-8") as f: + return f.read().strip() + return "" + + def get_system_release(self): + """ + Determine the system type. There's no unique way that would work + with all distros with a minimal package install. So, several + methods are used here. + + By default, it will use the lsb_release function. If not available, + it will fall back to reading the known different places where the + distro name is stored. + + Several modern distros now have /etc/os-release, which usually has + decent coverage.
+ """ + + system_release = "" + + if self.which("lsb_release"): + result = self.run(["lsb_release", "-d"], capture_output=True, text=True) + system_release = result.stdout.replace("Description:", "").strip() + + release_files = [ + "/etc/system-release", + "/etc/redhat-release", + "/etc/lsb-release", + "/etc/gentoo-release", + ] + + if not system_release: + for f in release_files: + system_release = self.catcheck(f) + if system_release: + break + + # This seems more common than LSB these days + if not system_release: + os_var = {} + try: + with open("/etc/os-release", "r", encoding="utf-8") as f: + for line in f: + match = re.match(r"^([\w\d\_]+)=\"?([^\"]*)\"?\n", line) + if match: + os_var[match.group(1)] = match.group(2) + + system_release = os_var.get("NAME", "") + if "VERSION_ID" in os_var: + system_release += " " + os_var["VERSION_ID"] + elif "VERSION" in os_var: + system_release += " " + os_var["VERSION"] + except IOError: + pass + + if not system_release: + system_release = self.catcheck("/etc/issue") + + system_release = system_release.strip() + + return system_release + +class SphinxDependencyChecker(MissingCheckers): + """ + Main class for checking Sphinx documentation build dependencies. + + - Check for missing system packages; + - Check for missing Python modules; + - Check for missing LaTeX packages needed by PDF generation; + - Propose Sphinx install via Python Virtual environment; + - Propose Sphinx install via distro-specific package install. + """ + def __init__(self, args): + """Initialize checker variables""" + + # List of required texlive packages on Fedora and OpenSuse + texlive = { + "amsfonts.sty": "texlive-amsfonts", + "amsmath.sty": "texlive-amsmath", + "amssymb.sty": "texlive-amsfonts", + "amsthm.sty": "texlive-amscls", + "anyfontsize.sty": "texlive-anyfontsize", + "atbegshi.sty": "texlive-oberdiek", + "bm.sty": "texlive-tools", + "capt-of.sty": "texlive-capt-of", + "cmap.sty": "texlive-cmap", + "ctexhook.sty": "texlive-ctex", + "ecrm1000.tfm": "texlive-ec", + "eqparbox.sty": "texlive-eqparbox", + "eu1enc.def": "texlive-euenc", + "fancybox.sty": "texlive-fancybox", + "fancyvrb.sty": "texlive-fancyvrb", + "float.sty": "texlive-float", + "fncychap.sty": "texlive-fncychap", + "footnote.sty": "texlive-mdwtools", + "framed.sty": "texlive-framed", + "luatex85.sty": "texlive-luatex85", + "multirow.sty": "texlive-multirow", + "needspace.sty": "texlive-needspace", + "palatino.sty": "texlive-psnfss", + "parskip.sty": "texlive-parskip", + "polyglossia.sty": "texlive-polyglossia", + "tabulary.sty": "texlive-tabulary", + "threeparttable.sty": "texlive-threeparttable", + "titlesec.sty": "texlive-titlesec", + "ucs.sty": "texlive-ucs", + "upquote.sty": "texlive-upquote", + "wrapfig.sty": "texlive-wrapfig", + } + + super().__init__(args, texlive) + + self.need_pip = False + self.rec_sphinx_upgrade = 0 + + self.system_release = self.get_system_release() + self.activate_cmd = "" + + # Some distros may not have a Sphinx shipped package compatible with + # our minimal requirements + self.package_supported = True + + # Recommend a new python version + self.recommend_python = None + + # Certain hints are meant to be shown only once + self.distro_msg = None + + self.latest_avail_ver = (0, 0, 0) + self.venv_ver = (0, 0, 0) + + prefix = os.environ.get("srctree", ".") + "/" + + self.conf = prefix + "Documentation/conf.py" + self.requirement_file = prefix + "Documentation/sphinx/requirements.txt" + + def get_install_progs(self, progs, cmd, extra=None): + """ + Check for missing dependencies 
using the provided program mapping. + + The actual distro-specific programs are mapped via progs argument. + """ + install = self.deps.check_missing(progs) + + if self.verbose_warn_install: + self.deps.warn_install() + + if not install: + return + + if cmd: + if self.verbose_warn_install: + msg = "You should run:" + else: + msg = "" + + if extra: + msg += "\n\t" + extra.replace("\n", "\n\t") + + return(msg + "\n\tsudo " + cmd + " " + install) + + return None + + # + # Distro-specific hints methods + # + + def give_debian_hints(self): + """ + Provide package installation hints for Debian-based distros. + """ + progs = { + "Pod::Usage": "perl-modules", + "convert": "imagemagick", + "dot": "graphviz", + "ensurepip": "python3-venv", + "python-sphinx": "python3-sphinx", + "rsvg-convert": "librsvg2-bin", + "virtualenv": "virtualenv", + "xelatex": "texlive-xetex", + "yaml": "python3-yaml", + } + + if self.pdf: + pdf_pkgs = { + "fonts-dejavu": [ + "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", + ], + "fonts-noto-cjk": [ + "/usr/share/fonts/noto-cjk/NotoSansCJK-Regular.ttc", + "/usr/share/fonts/opentype/noto/NotoSansCJK-Regular.ttc", + "/usr/share/fonts/opentype/noto/NotoSerifCJK-Regular.ttc", + ], + "tex-gyre": [ + "/usr/share/texmf/tex/latex/tex-gyre/tgtermes.sty" + ], + "texlive-fonts-recommended": [ + "/usr/share/texlive/texmf-dist/fonts/tfm/adobe/zapfding/pzdr.tfm", + ], + "texlive-lang-chinese": [ + "/usr/share/texlive/texmf-dist/tex/latex/ctex/ctexhook.sty", + ], + } + + for package, files in pdf_pkgs.items(): + self.check_missing_file(files, package, DepManager.PDF_MANDATORY) + + self.check_program("dvipng", DepManager.PDF_MANDATORY) + + if not self.distro_msg: + self.distro_msg = \ + "Note: ImageMagick is broken on some distros, affecting PDF output. For more details:\n" \ + "\thttps://askubuntu.com/questions/1158894/imagemagick-still-broken-using-with-usr-bin-convert" + + return self.get_install_progs(progs, "apt-get install") + + def give_redhat_hints(self): + """ + Provide package installation hints for RedHat-based distros + (Fedora, RHEL and RHEL-based variants). + """ + progs = { + "Pod::Usage": "perl-Pod-Usage", + "convert": "ImageMagick", + "dot": "graphviz", + "python-sphinx": "python3-sphinx", + "rsvg-convert": "librsvg2-tools", + "virtualenv": "python3-virtualenv", + "xelatex": "texlive-xetex-bin", + "yaml": "python3-pyyaml", + } + + fedora_tex_pkgs = [ + "dejavu-sans-fonts", + "dejavu-sans-mono-fonts", + "dejavu-serif-fonts", + "texlive-collection-fontsrecommended", + "texlive-collection-latex", + "texlive-xecjk", + ] + + fedora = False + rel = None + + match = re.search(r"(release|Linux)\s+(\d+)", self.system_release) + if match: + rel = int(match.group(2)) + + if not rel: + print("Couldn't identify release number") + noto_sans_redhat = None + self.pdf = False + elif re.search("Fedora", self.system_release): + # Fedora 38 and upper use this CJK font + + noto_sans_redhat = "google-noto-sans-cjk-fonts" + fedora = True + else: + # Almalinux, CentOS, RHEL, ... + + # at least up to version 9 (and Fedora < 38), that's the CJK font + noto_sans_redhat = "google-noto-sans-cjk-ttc-fonts" + + progs["virtualenv"] = "python-virtualenv" + + if not rel or rel < 8: + print("ERROR: Distro not supported. Too old?") + return + + # RHEL 8 uses Python 3.6, which is not compatible with + # the build system anymore. 
Suggest Python 3.9 + if rel == 8: + self.check_program("python3.9", DepManager.SYSTEM_MANDATORY) + progs["python3.9"] = "python39" + progs["yaml"] = "python39-pyyaml" + + self.recommend_python = True + + # There's no python39-sphinx package. Only pip is supported + self.package_supported = False + + if not self.distro_msg: + self.distro_msg = \ + "Note: RHEL-based distros typically require extra repositories.\n" \ + "For most, enabling epel and crb are enough:\n" \ + "\tsudo dnf install -y epel-release\n" \ + "\tsudo dnf config-manager --set-enabled crb\n" \ + "Yet, some may have other required repositories. Those commands could be useful:\n" \ + "\tsudo dnf repolist all\n" \ + "\tsudo dnf repoquery --available --info <pkgs>\n" \ + "\tsudo dnf config-manager --set-enabled '*' # enable all - probably not what you want" + + if self.pdf: + pdf_pkgs = [ + "/usr/share/fonts/google-noto-cjk/NotoSansCJK-Regular.ttc", + "/usr/share/fonts/google-noto-sans-cjk-fonts/NotoSansCJK-Regular.ttc", + ] + + self.check_missing_file(pdf_pkgs, noto_sans_redhat, DepManager.PDF_MANDATORY) + + self.check_rpm_missing(fedora_tex_pkgs, DepManager.PDF_MANDATORY) + + self.check_missing_tex() + + # There's no texlive-ctex on RHEL 8 repositories. This will + # likely affect CJK pdf build only. + if not fedora and rel == 8: + self.deps.del_package("texlive-ctex") + + return self.get_install_progs(progs, "dnf install") + + def give_opensuse_hints(self): + """ + Provide package installation hints for openSUSE-based distros + (Leap and Tumbleweed). + """ + progs = { + "Pod::Usage": "perl-Pod-Usage", + "convert": "ImageMagick", + "dot": "graphviz", + "python-sphinx": "python3-sphinx", + "virtualenv": "python3-virtualenv", + "xelatex": "texlive-xetex-bin texlive-dejavu", + "yaml": "python3-pyyaml", + } + + suse_tex_pkgs = [ + "texlive-babel-english", + "texlive-caption", + "texlive-colortbl", + "texlive-courier", + "texlive-dvips", + "texlive-helvetic", + "texlive-makeindex", + "texlive-metafont", + "texlive-metapost", + "texlive-palatino", + "texlive-preview", + "texlive-times", + "texlive-zapfchan", + "texlive-zapfding", + ] + + progs["latexmk"] = "texlive-latexmk-bin" + + match = re.search(r"(Leap)\s+(\d+)\.(\d+)", self.system_release) + if match: + rel = int(match.group(2)) + + # Leap 15.x uses Python 3.6, which is not compatible with + # the build system anymore. Suggest Python 3.11 + if rel == 15: + if not self.which(self.python_cmd): + self.check_program("python3.11", DepManager.SYSTEM_MANDATORY) + progs["python3.11"] = "python311" + self.recommend_python = True + + progs.update({ + "python-sphinx": "python311-Sphinx python311-Sphinx-latex", + "virtualenv": "python311-virtualenv", + "yaml": "python311-PyYAML", + }) + else: + # Tumbleweed defaults to Python 3.13 + + progs.update({ + "python-sphinx": "python313-Sphinx python313-Sphinx-latex", + "virtualenv": "python313-virtualenv", + "yaml": "python313-PyYAML", + }) + + # FIXME: add support for installing CJK fonts + # + # I tried hard, but was unable to find a way to install + # "Noto Sans CJK SC" on openSUSE + + if self.pdf: + self.check_rpm_missing(suse_tex_pkgs, DepManager.PDF_MANDATORY) + self.check_missing_tex() + + return self.get_install_progs(progs, "zypper install --no-recommends") + + def give_mageia_hints(self): + """ + Provide package installation hints for Mageia and OpenMandriva.
+ """ + progs = { + "Pod::Usage": "perl-Pod-Usage", + "convert": "ImageMagick", + "dot": "graphviz", + "python-sphinx": "python3-sphinx", + "rsvg-convert": "librsvg2", + "virtualenv": "python3-virtualenv", + "xelatex": "texlive", + "yaml": "python3-yaml", + } + + tex_pkgs = [ + "texlive-fontsextra", + "texlive-fonts-asian", + "fonts-ttf-dejavu", + ] + + if re.search(r"OpenMandriva", self.system_release): + packager_cmd = "dnf install" + noto_sans = "noto-sans-cjk-fonts" + tex_pkgs = [ + "texlive-collection-basic", + "texlive-collection-langcjk", + "texlive-collection-fontsextra", + "texlive-collection-fontsrecommended" + ] + + # Tested on OpenMandriva Lx 4.3 + progs["convert"] = "imagemagick" + progs["yaml"] = "python-pyyaml" + progs["python-virtualenv"] = "python-virtualenv" + progs["python-sphinx"] = "python-sphinx" + progs["xelatex"] = "texlive" + + self.check_program("python-virtualenv", DepManager.PYTHON_MANDATORY) + + # On my tests with openMandriva LX 4.0 docker image, upgraded + # to 4.3, python-virtualenv package is broken: it is missing + # ensurepip. Without it, the alternative would be to run: + # python3 -m venv --without-pip ~/sphinx_latest, but running + # pip there won't install sphinx at venv. + # + # Add a note about that. + + if not self.distro_msg: + self.distro_msg = \ + "Notes:\n"\ + "1. for venv, ensurepip could be broken, preventing its install method.\n" \ + "2. at least on OpenMandriva LX 4.3, texlive packages seem broken" + + else: + packager_cmd = "urpmi" + noto_sans = "google-noto-sans-cjk-ttc-fonts" + + progs["latexmk"] = "texlive-collection-basic" + + if self.pdf: + pdf_pkgs = [ + "/usr/share/fonts/google-noto-cjk/NotoSansCJK-Regular.ttc", + "/usr/share/fonts/TTF/NotoSans-Regular.ttf", + ] + + self.check_missing_file(pdf_pkgs, noto_sans, DepManager.PDF_MANDATORY) + self.check_rpm_missing(tex_pkgs, DepManager.PDF_MANDATORY) + + return self.get_install_progs(progs, packager_cmd) + + def give_arch_linux_hints(self): + """ + Provide package installation hints for ArchLinux. + """ + progs = { + "convert": "imagemagick", + "dot": "graphviz", + "latexmk": "texlive-core", + "rsvg-convert": "extra/librsvg", + "virtualenv": "python-virtualenv", + "xelatex": "texlive-xetex", + "yaml": "python-yaml", + } + + archlinux_tex_pkgs = [ + "texlive-basic", + "texlive-binextra", + "texlive-core", + "texlive-fontsrecommended", + "texlive-langchinese", + "texlive-langcjk", + "texlive-latexextra", + "ttf-dejavu", + ] + + if self.pdf: + self.check_pacman_missing(archlinux_tex_pkgs, + DepManager.PDF_MANDATORY) + + self.check_missing_file(["/usr/share/fonts/noto-cjk/NotoSansCJK-Regular.ttc"], + "noto-fonts-cjk", + DepManager.PDF_MANDATORY) + + + return self.get_install_progs(progs, "pacman -S") + + def give_gentoo_hints(self): + """ + Provide package installation hints for Gentoo. 
+ """ + texlive_deps = [ + "dev-texlive/texlive-fontsrecommended", + "dev-texlive/texlive-latexextra", + "dev-texlive/texlive-xetex", + "media-fonts/dejavu", + ] + + progs = { + "convert": "media-gfx/imagemagick", + "dot": "media-gfx/graphviz", + "rsvg-convert": "gnome-base/librsvg", + "virtualenv": "dev-python/virtualenv", + "xelatex": " ".join(texlive_deps), + "yaml": "dev-python/pyyaml", + "python-sphinx": "dev-python/sphinx", + } + + if self.pdf: + pdf_pkgs = { + "media-fonts/dejavu": [ + "/usr/share/fonts/dejavu/DejaVuSans.ttf", + ], + "media-fonts/noto-cjk": [ + "/usr/share/fonts/noto-cjk/NotoSansCJKsc-Regular.otf", + "/usr/share/fonts/noto-cjk/NotoSerifCJK-Regular.ttc", + ], + } + for package, files in pdf_pkgs.items(): + self.check_missing_file(files, package, DepManager.PDF_MANDATORY) + + # Handling dependencies is a nightmare, as Gentoo refuses to emerge + # some packages if there's no package.use file describing them. + # To make it worse, compilation flags shall also be present there + # for some packages. If USE is not perfect, error/warning messages + # like those are shown: + # + # !!! The following binary packages have been ignored due to non matching USE: + # + # =media-gfx/graphviz-12.2.1-r1 X pdf -python_single_target_python3_13 qt6 svg + # =media-gfx/graphviz-12.2.1-r1 X pdf python_single_target_python3_12 -python_single_target_python3_13 qt6 svg + # =media-gfx/graphviz-12.2.1-r1 X pdf qt6 svg + # =media-gfx/graphviz-12.2.1-r1 X pdf -python_single_target_python3_10 qt6 svg + # =media-gfx/graphviz-12.2.1-r1 X pdf -python_single_target_python3_10 python_single_target_python3_12 -python_single_target_python3_13 qt6 svg + # =media-fonts/noto-cjk-20190416 X + # =app-text/texlive-core-2024-r1 X cjk -xetex + # =app-text/texlive-core-2024-r1 X -xetex + # =app-text/texlive-core-2024-r1 -xetex + # =dev-libs/zziplib-0.13.79-r1 sdl + # + # And will ignore such packages, installing the remaining ones. That + # affects mostly the image extension and PDF generation. + + # Package dependencies and the minimal needed args: + portages = { + "graphviz": "media-gfx/graphviz", + "imagemagick": "media-gfx/imagemagick", + "media-libs": "media-libs/harfbuzz icu", + "media-fonts": "media-fonts/noto-cjk", + "texlive": "app-text/texlive-core xetex", + "zziblib": "dev-libs/zziplib sdl", + } + + extra_cmds = "" + if not self.distro_msg: + self.distro_msg = "Note: Gentoo requires package.use to be adjusted before emerging packages" + + use_base = "/etc/portage/package.use" + files = glob(f"{use_base}/*") + + for fname, portage in portages.items(): + install = False + + while install is False: + if not files: + # No files under package.usage. 
+    def get_install(self):
+        """
+        OS-specific hints logic. Looks for a hinter. If one is found, use
+        it to provide package-manager specific install commands.
+
+        Otherwise, output install instructions for the meta-packages.
+
+        Returns a string with the command to be executed to install the
+        needed packages, if the distro was detected. Otherwise, returns
+        just a list of packages that require installation.
+        """
+        os_hints = {
+            re.compile("Red Hat Enterprise Linux"): self.give_redhat_hints,
+            re.compile("Fedora"): self.give_redhat_hints,
+            re.compile("AlmaLinux"): self.give_redhat_hints,
+            re.compile("Amazon Linux"): self.give_redhat_hints,
+            re.compile("CentOS"): self.give_redhat_hints,
+            re.compile("openEuler"): self.give_redhat_hints,
+            re.compile("Oracle Linux Server"): self.give_redhat_hints,
+            re.compile("Rocky Linux"): self.give_redhat_hints,
+            re.compile("Springdale Open Enterprise"): self.give_redhat_hints,
+
+            re.compile("Ubuntu"): self.give_debian_hints,
+            re.compile("Debian"): self.give_debian_hints,
+            re.compile("Devuan"): self.give_debian_hints,
+            re.compile("Kali"): self.give_debian_hints,
+            re.compile("Mint"): self.give_debian_hints,
+
+            re.compile("openSUSE"): self.give_opensuse_hints,
+
+            re.compile("Mageia"): self.give_mageia_hints,
+            re.compile("OpenMandriva"): self.give_mageia_hints,
+
+            re.compile("Arch Linux"): self.give_arch_linux_hints,
+            re.compile("Gentoo"): self.give_gentoo_hints,
+        }
+
+        # If the OS is detected, use per-OS hint logic
+        for regex, os_hint in os_hints.items():
+            if regex.search(self.system_release):
+                return os_hint()
+
+        #
+        # Fall back to generic hint code for other distros.
+        # That's far from ideal, especially for LaTeX dependencies.
+        #
+        progs = {"sphinx-build": "sphinx"}
+        if self.pdf:
+            self.check_missing_tex()
+
+        self.distro_msg = \
+            f"I don't know the distro {self.system_release}.\n" \
+            "So, I can't provide you a hint with the install procedure.\n" \
+            "There are likely missing dependencies."
+
+        return self.get_install_progs(progs, None)
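
[editor's note: the dispatch above is just "first regex that matches system_release wins". A minimal sketch, with a made-up release string:]

    import re

    # Abbreviated version of the os_hints table above
    os_hints = {
        re.compile("Fedora"): "give_redhat_hints",
        re.compile("Debian"): "give_debian_hints",
        re.compile("Gentoo"): "give_gentoo_hints",
    }

    system_release = "Fedora Linux 40"
    for regex, hinter in os_hints.items():
        if regex.search(system_release):
            print(hinter)   # -> give_redhat_hints
            break
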
+ """ + + cwd = os.getcwd() + + activates = [] + + # Add all sphinx prefixes with possible version numbers + for p in self.virtenv_prefix: + activates += glob(f"{cwd}/{p}[0-9]*/bin/activate") + + activates.sort(reverse=True, key=str.lower) + + # Place sphinx_latest first, if it exists + for p in self.virtenv_prefix: + activates = glob(f"{cwd}/{p}*latest/bin/activate") + activates + + ver = (0, 0, 0) + for f in activates: + # Discard too old Sphinx virtual environments + match = re.search(r"(\d+)\.(\d+)\.(\d+)", f) + if match: + ver = (int(match.group(1)), int(match.group(2)), int(match.group(3))) + + if ver < self.min_version: + continue + + sphinx_cmd = f.replace("activate", "sphinx-build") + if not os.path.isfile(sphinx_cmd): + continue + + ver = self.get_sphinx_version(sphinx_cmd) + + if not ver: + venv_dir = f.replace("/bin/activate", "") + print(f"Warning: virtual environment {venv_dir} is not working.\n" \ + "Python version upgrade? Remove it with:\n\n" \ + "\trm -rf {venv_dir}\n\n") + else: + if self.need_sphinx and ver >= self.min_version: + return (f, ver) + elif parse_version(ver) > self.cur_version: + return (f, ver) + + return ("", ver) + + def recommend_sphinx_upgrade(self): + """ + Check if Sphinx needs to be upgraded. + + Returns a tuple with the higest available Sphinx version if found. + Otherwise, returns None to indicate either that no upgrade is needed + or no venv was found. + """ + + # Avoid running sphinx-builds from venv if cur_version is good + if self.cur_version and self.cur_version >= RECOMMENDED_VERSION: + self.latest_avail_ver = self.cur_version + return None + + # Get the highest version from sphinx_*/bin/sphinx-build and the + # corresponding command to activate the venv/virtenv + self.activate_cmd, self.venv_ver = self.get_virtenv() + + # Store the highest version from Sphinx existing virtualenvs + if self.activate_cmd and self.venv_ver > self.cur_version: + self.latest_avail_ver = self.venv_ver + else: + if self.cur_version: + self.latest_avail_ver = self.cur_version + else: + self.latest_avail_ver = (0, 0, 0) + + # As we don't know package version of Sphinx, and there's no + # virtual environments, don't check if upgrades are needed + if not self.virtualenv: + if not self.latest_avail_ver: + return None + + return self.latest_avail_ver + + # Either there are already a virtual env or a new one should be created + self.need_pip = True + + if not self.latest_avail_ver: + return None + + # Return if the reason is due to an upgrade or not + if self.latest_avail_ver != (0, 0, 0): + if self.latest_avail_ver < RECOMMENDED_VERSION: + self.rec_sphinx_upgrade = 1 + + return self.latest_avail_ver + + def recommend_package(self): + """ + Recommend installing Sphinx as a distro-specific package. + """ + + print("\n2) As a package with:") + + old_need = self.deps.need + old_optional = self.deps.optional + + self.pdf = False + self.deps.optional = 0 + old_verbose = self.verbose_warn_install + self.verbose_warn_install = 0 + + self.deps.clear_deps() + + self.deps.add_package("python-sphinx", DepManager.PYTHON_MANDATORY) + + cmd = self.get_install() + if cmd: + print(cmd) + + self.deps.need = old_need + self.deps.optional = old_optional + self.verbose_warn_install = old_verbose + + def recommend_sphinx_version(self, virtualenv_cmd): + """ + Provide recommendations for installing or upgrading Sphinx based + on current version. 
+    def recommend_sphinx_version(self, virtualenv_cmd):
+        """
+        Provide recommendations for installing or upgrading Sphinx, based
+        on the current version.
+
+        The logic here is complex, as it has to deal with different versions:
+
+            - minimal supported version;
+            - minimal PDF version;
+            - recommended version.
+
+        It also needs to work fine with both distro packages and
+        venv/virtualenv.
+        """
+
+        if self.recommend_python:
+            cur_ver = sys.version_info[:3]
+            if cur_ver < MIN_PYTHON_VERSION:
+                print(f"\nPython version {ver_str(cur_ver)} is incompatible with doc build.\n" \
+                      "Please upgrade it and re-run.\n")
+                return
+
+        # Version is OK. Nothing to do.
+        if self.cur_version != (0, 0, 0) and self.cur_version >= RECOMMENDED_VERSION:
+            return
+
+        if self.latest_avail_ver:
+            latest_avail_ver = ver_str(self.latest_avail_ver)
+
+        if not self.need_sphinx:
+            # sphinx-build is present and its version is >= min_version
+
+            # Only recommend enabling a newer virtenv version if it makes sense
+            if self.latest_avail_ver and self.latest_avail_ver > self.cur_version:
+                print(f"\nYou may also use the newer Sphinx version {latest_avail_ver} with:")
+                if any(p in os.getcwd() for p in self.virtenv_prefix):
+                    print("\tdeactivate")
+                print(f"\t. {self.activate_cmd}")
+                self.deactivate_help()
+                return
+
+            if self.latest_avail_ver and self.latest_avail_ver >= RECOMMENDED_VERSION:
+                return
+
+        if not self.virtualenv:
+            # No Sphinx either via package or via virtenv. As we can't
+            # compare versions here, just return, recommending that the
+            # user install it from the distro package.
+            if not self.latest_avail_ver or self.latest_avail_ver == (0, 0, 0):
+                return
+
+            # The user doesn't want a virtenv recommendation, but a newer
+            # version is already installed via virtenv.
+            # So, print commands to enable it
+            if self.latest_avail_ver > self.cur_version:
+                print(f"\nYou may also use the Sphinx virtualenv version {latest_avail_ver} with:")
+                if any(p in os.getcwd() for p in self.virtenv_prefix):
+                    print("\tdeactivate")
+                print(f"\t. {self.activate_cmd}")
+                self.deactivate_help()
+                return
+            print("\n")
+        else:
+            if self.need_sphinx:
+                self.deps.need += 1
+
+        # Suggest newer versions if current ones are too old
+        if self.latest_avail_ver and self.latest_avail_ver >= self.min_version:
+            if self.latest_avail_ver >= RECOMMENDED_VERSION:
+                print(f"\nNeed to activate Sphinx (version {latest_avail_ver}) on virtualenv with:")
+                print(f"\t. {self.activate_cmd}")
+                self.deactivate_help()
+                return
+
+            # Version is above the minimal required one, but may be
+            # below the recommended one. So, print warnings/notes
+            if self.latest_avail_ver < RECOMMENDED_VERSION:
+                print(f"Warning: Sphinx version {ver_str(RECOMMENDED_VERSION)} or later is recommended.")
+
+        # At this point, either Sphinx needs to be installed or an upgrade
+        # is recommended, both via pip
+
+        if self.rec_sphinx_upgrade:
+            if not self.virtualenv:
+                print("Instead of installing/upgrading the Python Sphinx package, you could use pip/pypi with:\n\n")
+            else:
+                print("To upgrade Sphinx, use:\n\n")
+        else:
+            print("\nSphinx needs to be installed either:\n1) via pip/pypi with:\n")
+
+        if not virtualenv_cmd:
+            print("    Currently not possible.\n")
+            print("    Please upgrade Python to a newer version and run this script again")
+        else:
+            print(f"\t{virtualenv_cmd} {self.virtenv_dir}")
+            print(f"\t. {self.virtenv_dir}/bin/activate")
+            print(f"\tpip install -r {self.requirement_file}")
+            self.deactivate_help()
+
+        if self.package_supported:
+            self.recommend_package()
+
+        print("\n" \
+              "    Please note that Sphinx currently produces false-positive\n" \
+              "    warnings when the same name is used for more than one type (functions,\n" \
+              "    structs, enums, ...). This is a known Sphinx bug. For more details, see:\n" \
+              "\thttps://github.com/sphinx-doc/sphinx/pull/8313")
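
[editor's note: all of the recommendation logic above compares versions as plain integer tuples, which Python orders lexicographically. A minimal sketch; the version numbers below are made up, the real MIN_PYTHON_VERSION and RECOMMENDED_VERSION constants are defined earlier in this file:]

    MIN_VERSION = (3, 4, 3)
    RECOMMENDED = (8, 2, 3)

    cur = (5, 3, 0)
    print(cur >= MIN_VERSION)   # True: good enough to build the docs
    print(cur >= RECOMMENDED)   # False: an upgrade will be suggested
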
+    def check_needs(self):
+        """
+        Main method that checks needed dependencies and provides
+        recommendations.
+        """
+        self.python_cmd = sys.executable
+
+        # Check if Sphinx is already accessible from the current environment
+        self.check_sphinx(self.conf)
+
+        if self.system_release:
+            print(f"Detected OS: {self.system_release}.")
+        else:
+            print("Unknown OS")
+        if self.cur_version != (0, 0, 0):
+            ver = ver_str(self.cur_version)
+            print(f"Sphinx version: {ver}\n")
+
+        # Check the type of virtual env, depending on the Python version
+        virtualenv_cmd = None
+
+        if sys.version_info < MIN_PYTHON_VERSION:
+            min_ver = ver_str(MIN_PYTHON_VERSION)
+            print(f"ERROR: at least Python {min_ver} is required to build the kernel docs")
+            self.need_sphinx = 1
+
+        self.venv_ver = self.recommend_sphinx_upgrade()
+
+        if self.need_pip:
+            if sys.version_info < MIN_PYTHON_VERSION:
+                self.need_pip = False
+                print("Warning: python version is not supported.")
+            else:
+                virtualenv_cmd = f"{self.python_cmd} -m venv"
+                self.check_python_module("ensurepip")
+
+        # Check for needed programs/tools
+        self.check_perl_module("Pod::Usage", DepManager.SYSTEM_MANDATORY)
+
+        self.check_program("make", DepManager.SYSTEM_MANDATORY)
+        self.check_program("which", DepManager.SYSTEM_MANDATORY)
+
+        self.check_program("dot", DepManager.SYSTEM_OPTIONAL)
+        self.check_program("convert", DepManager.SYSTEM_OPTIONAL)
+
+        self.check_python_module("yaml")
+
+        if self.pdf:
+            self.check_program("xelatex", DepManager.PDF_MANDATORY)
+            self.check_program("rsvg-convert", DepManager.PDF_MANDATORY)
+            self.check_program("latexmk", DepManager.PDF_MANDATORY)
+
+        # Do distro-specific checks and output distro-install commands
+        cmd = self.get_install()
+        if cmd:
+            print(cmd)
+
+        # If the distro requires some special instructions, print them here.
+        # Please note that get_install() needs to be called first.
+        if self.distro_msg:
+            print("\n" + self.distro_msg)
+
+        if not self.python_cmd:
+            if self.deps.need == 1:
+                sys.exit("Can't build as 1 mandatory dependency is missing")
+            elif self.deps.need:
+                sys.exit(f"Can't build as {self.deps.need} mandatory dependencies are missing")
+
+        # Check if sphinx-build is called sphinx-build-3
+        if self.need_symlink:
+            sphinx_path = self.which("sphinx-build-3")
+            if sphinx_path:
+                print(f"\tsudo ln -sf {sphinx_path} /usr/bin/sphinx-build\n")
+
+        self.recommend_sphinx_version(virtualenv_cmd)
+        print("")
+
+        if not self.deps.optional:
+            print("All optional dependencies are met.")
+
+        if self.deps.need == 1:
+            sys.exit("Can't build as 1 mandatory dependency is missing")
+        elif self.deps.need:
+            sys.exit(f"Can't build as {self.deps.need} mandatory dependencies are missing")
+
+        print("Needed package dependencies are met.")
+
+
+DESCRIPTION = """
+Process some flags related to Sphinx installation and documentation build.
+""" + + +def main(): + """Main function""" + parser = argparse.ArgumentParser(description=DESCRIPTION) + + parser.add_argument( + "--no-virtualenv", + action="store_false", + dest="virtualenv", + help="Recommend installing Sphinx instead of using a virtualenv", + ) + + parser.add_argument( + "--no-pdf", + action="store_false", + dest="pdf", + help="Don't check for dependencies required to build PDF docs", + ) + + parser.add_argument( + "--version-check", + action="store_true", + dest="version_check", + help="If version is compatible, don't check for missing dependencies", + ) -sub deactivate_help() -{ - printf "\n If you want to exit the virtualenv, you can use:\n"; - printf "\tdeactivate\n"; -} - -sub get_virtenv() -{ - my $ver; - my $min_activate = "$ENV{'PWD'}/${virtenv_prefix}${min_version}/bin/activate"; - my @activates = glob "$ENV{'PWD'}/${virtenv_prefix}*/bin/activate"; - - @activates = sort {$b cmp $a} @activates; - - foreach my $f (@activates) { - next if ($f lt $min_activate); - - my $sphinx_cmd = $f; - $sphinx_cmd =~ s/activate/sphinx-build/; - next if (! -f $sphinx_cmd); - - my $ver = get_sphinx_version($sphinx_cmd); - - if (!$ver) { - $f =~ s#/bin/activate##; - print("Warning: virtual environment $f is not working.\nPython version upgrade? Remove it with:\n\n\trm -rf $f\n\n"); - } - - if ($need_sphinx && ($ver ge $min_version)) { - return ($f, $ver); - } elsif ($ver gt $cur_version) { - return ($f, $ver); - } - } - return ("", ""); -} - -sub recommend_sphinx_upgrade() -{ - my $venv_ver; - - # Avoid running sphinx-builds from venv if $cur_version is good - if ($cur_version && ($cur_version ge $rec_version)) { - $latest_avail_ver = $cur_version; - return; - } - - # Get the highest version from sphinx_*/bin/sphinx-build and the - # corresponding command to activate the venv/virtenv - ($activate_cmd, $venv_ver) = get_virtenv(); - - # Store the highest version from Sphinx existing virtualenvs - if (($activate_cmd ne "") && ($venv_ver gt $cur_version)) { - $latest_avail_ver = $venv_ver; - } else { - $latest_avail_ver = $cur_version if ($cur_version); - } - - # As we don't know package version of Sphinx, and there's no - # virtual environments, don't check if upgrades are needed - if (!$virtualenv) { - return if (!$latest_avail_ver); - } - - # Either there are already a virtual env or a new one should be created - $need_pip = 1; - - return if (!$latest_avail_ver); - - # Return if the reason is due to an upgrade or not - if ($latest_avail_ver lt $rec_version) { - $rec_sphinx_upgrade = 1; - } - - return $latest_avail_ver; -} - -# -# The logic here is complex, as it have to deal with different versions: -# - minimal supported version; -# - minimal PDF version; -# - recommended version. -# It also needs to work fine with both distro's package and venv/virtualenv -sub recommend_sphinx_version($) -{ - my $virtualenv_cmd = shift; - - # Version is OK. Nothing to do. - if ($cur_version && ($cur_version ge $rec_version)) { - return; - }; - - if (!$need_sphinx) { - # sphinx-build is present and its version is >= $min_version - - #only recommend enabling a newer virtenv version if makes sense. - if ($latest_avail_ver gt $cur_version) { - printf "\nYou may also use the newer Sphinx version $latest_avail_ver with:\n"; - printf "\tdeactivate\n" if ($ENV{'PWD'} =~ /${virtenv_prefix}/); - printf "\t. $activate_cmd\n"; - deactivate_help(); - - return; - } - return if ($latest_avail_ver ge $rec_version); - } - - if (!$virtualenv) { - # No sphinx either via package or via virtenv. 
As we can't - # Compare the versions here, just return, recommending the - # user to install it from the package distro. - return if (!$latest_avail_ver); - - # User doesn't want a virtenv recommendation, but he already - # installed one via virtenv with a newer version. - # So, print commands to enable it - if ($latest_avail_ver gt $cur_version) { - printf "\nYou may also use the Sphinx virtualenv version $latest_avail_ver with:\n"; - printf "\tdeactivate\n" if ($ENV{'PWD'} =~ /${virtenv_prefix}/); - printf "\t. $activate_cmd\n"; - deactivate_help(); - - return; - } - print "\n"; - } else { - $need++ if ($need_sphinx); - } - - # Suggest newer versions if current ones are too old - if ($latest_avail_ver && $latest_avail_ver ge $min_version) { - # If there's a good enough version, ask the user to enable it - if ($latest_avail_ver ge $rec_version) { - printf "\nNeed to activate Sphinx (version $latest_avail_ver) on virtualenv with:\n"; - printf "\t. $activate_cmd\n"; - deactivate_help(); - - return; - } - - # Version is above the minimal required one, but may be - # below the recommended one. So, print warnings/notes - - if ($latest_avail_ver lt $rec_version) { - print "Warning: It is recommended at least Sphinx version $rec_version.\n"; - } - } - - # At this point, either it needs Sphinx or upgrade is recommended, - # both via pip - - if ($rec_sphinx_upgrade) { - if (!$virtualenv) { - print "Instead of install/upgrade Python Sphinx pkg, you could use pip/pypi with:\n\n"; - } else { - print "To upgrade Sphinx, use:\n\n"; - } - } else { - print "\nSphinx needs to be installed either:\n1) via pip/pypi with:\n\n"; - } - - $python_cmd = find_python_no_venv(); - - printf "\t$virtualenv_cmd $virtenv_dir\n"; - - printf "\t. $virtenv_dir/bin/activate\n"; - printf "\tpip install -r $requirement_file\n"; - deactivate_help(); - - printf "\n2) As a package with:\n"; - - my $old_need = $need; - my $old_optional = $optional; - %missing = (); - $pdf = 0; - $optional = 0; - $install = ""; - $verbose_warn_install = 0; - - add_package("python-sphinx", 0); - - check_distros(); - - $need = $old_need; - $optional = $old_optional; - - printf "\n Please note that Sphinx >= 3.0 will currently produce false-positive\n"; - printf " warning when the same name is used for more than one type (functions,\n"; - printf " structs, enums,...). This is known Sphinx bug. 
For more details, see:\n"; - printf "\thttps://github.com/sphinx-doc/sphinx/pull/8313\n"; -} - -sub check_needs() -{ - # Check if Sphinx is already accessible from current environment - check_sphinx(); - - if ($system_release) { - print "Detected OS: $system_release.\n"; - } else { - print "Unknown OS\n"; - } - printf "Sphinx version: %s\n\n", $cur_version if ($cur_version); - - # Check python command line, trying first python3 - $python_cmd = findprog("python3"); - $python_cmd = check_program("python", 0) if (!$python_cmd); - - # Check the type of virtual env, depending on Python version - if ($python_cmd) { - if ($virtualenv) { - my $tmp = qx($python_cmd --version 2>&1); - if ($tmp =~ m/(\d+\.)(\d+\.)/) { - if ($1 < 3) { - # Fail if it finds python2 (or worse) - die "Python 3 is required to build the kernel docs\n"; - } - if ($1 == 3 && $2 < 3) { - # Need Python 3.3 or upper for venv - $need_virtualenv = 1; - } - } else { - die "Warning: couldn't identify $python_cmd version!"; - } - } else { - add_package("python-sphinx", 0); - } - } - - my $venv_ver = recommend_sphinx_upgrade(); - - my $virtualenv_cmd; - - if ($need_pip) { - # Set virtualenv command line, if python < 3.3 - if ($need_virtualenv) { - $virtualenv_cmd = findprog("virtualenv-3"); - $virtualenv_cmd = findprog("virtualenv-3.5") if (!$virtualenv_cmd); - if (!$virtualenv_cmd) { - check_program("virtualenv", 0); - $virtualenv_cmd = "virtualenv"; - } - } else { - $virtualenv_cmd = "$python_cmd -m venv"; - check_python_module("ensurepip", 0); - } - } - - # Check for needed programs/tools - check_perl_module("Pod::Usage", 0); - check_python_module("yaml", 0); - check_program("make", 0); - check_program("gcc", 0); - check_program("dot", 1); - check_program("convert", 1); - - # Extra PDF files - should use 2 for is_optional - check_program("xelatex", 2) if ($pdf); - check_program("rsvg-convert", 2) if ($pdf); - check_program("latexmk", 2) if ($pdf); - - # Do distro-specific checks and output distro-install commands - check_distros(); - - if (!$python_cmd) { - if ($need == 1) { - die "Can't build as $need mandatory dependency is missing"; - } elsif ($need) { - die "Can't build as $need mandatory dependencies are missing"; - } - } - - # Check if sphinx-build is called sphinx-build-3 - if ($need_symlink) { - printf "\tsudo ln -sf %s /usr/bin/sphinx-build\n\n", - which("sphinx-build-3"); - } - - recommend_sphinx_version($virtualenv_cmd); - printf "\n"; - - print "All optional dependencies are met.\n" if (!$optional); - - if ($need == 1) { - die "Can't build as $need mandatory dependency is missing"; - } elsif ($need) { - die "Can't build as $need mandatory dependencies are missing"; - } - - print "Needed package dependencies are met.\n"; -} - -# -# Main -# - -while (@ARGV) { - my $arg = shift(@ARGV); - - if ($arg eq "--no-virtualenv") { - $virtualenv = 0; - } elsif ($arg eq "--no-pdf"){ - $pdf = 0; - } elsif ($arg eq "--version-check"){ - $version_check = 1; - } else { - print "Usage:\n\t$0 <--no-virtualenv> <--no-pdf> <--version-check>\n\n"; - print "Where:\n"; - print "\t--no-virtualenv\t- Recommend installing Sphinx instead of using a virtualenv\n"; - print "\t--version-check\t- if version is compatible, don't check for missing dependencies\n"; - print "\t--no-pdf\t- don't check for dependencies required to build PDF docs\n\n"; - exit -1; - } -} - -# -# Determine the system type. There's no standard unique way that would -# work with all distros with a minimal package install. So, several -# methods are used here. 
-# -# By default, it will use lsb_release function. If not available, it will -# fail back to reading the known different places where the distro name -# is stored -# + args = parser.parse_args() + + checker = SphinxDependencyChecker(args) -$system_release = qx(lsb_release -d) if which("lsb_release"); -$system_release =~ s/Description:\s*// if ($system_release); -$system_release = catcheck("/etc/system-release") if !$system_release; -$system_release = catcheck("/etc/redhat-release") if !$system_release; -$system_release = catcheck("/etc/lsb-release") if !$system_release; -$system_release = catcheck("/etc/gentoo-release") if !$system_release; - -# This seems more common than LSB these days -if (!$system_release) { - my %os_var; - if (open IN, "cat /etc/os-release|") { - while (<IN>) { - if (m/^([\w\d\_]+)=\"?([^\"]*)\"?\n/) { - $os_var{$1}=$2; - } - } - $system_release = $os_var{"NAME"}; - if (defined($os_var{"VERSION_ID"})) { - $system_release .= " " . $os_var{"VERSION_ID"} if (defined($os_var{"VERSION_ID"})); - } else { - $system_release .= " " . $os_var{"VERSION"}; - } - } -} -$system_release = catcheck("/etc/issue") if !$system_release; -$system_release =~ s/\s+$//; - -check_needs; + checker.check_python() + checker.check_needs() + +# Call main if not used as module +if __name__ == "__main__": + main() diff --git a/security/apparmor/af_unix.c b/security/apparmor/af_unix.c index 9129766d1e9c..ac0f4be791ec 100644 --- a/security/apparmor/af_unix.c +++ b/security/apparmor/af_unix.c @@ -31,7 +31,7 @@ static inline struct sock *aa_unix_sk(struct unix_sock *u) } static int unix_fs_perm(const char *op, u32 mask, const struct cred *subj_cred, - struct aa_label *label, struct path *path) + struct aa_label *label, const struct path *path) { AA_BUG(!label); AA_BUG(!path); @@ -224,7 +224,7 @@ static int profile_create_perm(struct aa_profile *profile, int family, static int profile_sk_perm(struct aa_profile *profile, struct apparmor_audit_data *ad, - u32 request, struct sock *sk, struct path *path) + u32 request, struct sock *sk, const struct path *path) { struct aa_ruleset *rules = profile->label.rules[0]; struct aa_perms *p = NULL; @@ -386,9 +386,9 @@ static int profile_opt_perm(struct aa_profile *profile, u32 request, /* null peer_label is allowed, in which case the peer_sk label is used */ static int profile_peer_perm(struct aa_profile *profile, u32 request, - struct sock *sk, struct path *path, + struct sock *sk, const struct path *path, struct sockaddr_un *peer_addr, - int peer_addrlen, struct path *peer_path, + int peer_addrlen, const struct path *peer_path, struct aa_label *peer_label, struct apparmor_audit_data *ad) { @@ -445,7 +445,7 @@ int aa_unix_create_perm(struct aa_label *label, int family, int type, static int aa_unix_label_sk_perm(const struct cred *subj_cred, struct aa_label *label, const char *op, u32 request, struct sock *sk, - struct path *path) + const struct path *path) { if (!unconfined(label)) { struct aa_profile *profile; @@ -599,9 +599,9 @@ int aa_unix_opt_perm(const char *op, u32 request, struct socket *sock, static int unix_peer_perm(const struct cred *subj_cred, struct aa_label *label, const char *op, u32 request, - struct sock *sk, struct path *path, + struct sock *sk, const struct path *path, struct sockaddr_un *peer_addr, int peer_addrlen, - struct path *peer_path, struct aa_label *peer_label) + const struct path *peer_path, struct aa_label *peer_label) { struct aa_profile *profile; DEFINE_AUDIT_SK(ad, op, subj_cred, sk); diff --git a/tools/docs/gen-redirects.py 
b/tools/docs/gen-redirects.py new file mode 100755 index 000000000000..6a6ebf6f42dc --- /dev/null +++ b/tools/docs/gen-redirects.py @@ -0,0 +1,54 @@ +#! /usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 +# +# Copyright © 2025, Oracle and/or its affiliates. +# Author: Vegard Nossum <vegard.nossum@oracle.com> + +"""Generate HTML redirects for renamed Documentation/**.rst files using +the output of tools/docs/gen-renames.py. + +Example: + + tools/docs/gen-redirects.py --output Documentation/output/ < Documentation/.renames.txt +""" + +import argparse +import os +import sys + +parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) +parser.add_argument('-o', '--output', help='output directory') + +args = parser.parse_args() + +for line in sys.stdin: + line = line.rstrip('\n') + + old_name, new_name = line.split(' ', 2) + + old_html_path = os.path.join(args.output, old_name + '.html') + new_html_path = os.path.join(args.output, new_name + '.html') + + if not os.path.exists(new_html_path): + print(f"warning: target does not exist: {new_html_path} (redirect from {old_html_path})") + continue + + old_html_dir = os.path.dirname(old_html_path) + if not os.path.exists(old_html_dir): + os.makedirs(old_html_dir) + + relpath = os.path.relpath(new_name, os.path.dirname(old_name)) + '.html' + + with open(old_html_path, 'w') as f: + print(f"""\ +<!DOCTYPE html> + +<html lang="en"> +<head> + <title>This page has moved</title> + <meta http-equiv="refresh" content="0; url={relpath}"> +</head> +<body> +<p>This page has moved to <a href="{relpath}">{new_name}</a>.</p> +</body> +</html>""", file=f) diff --git a/tools/docs/gen-renames.py b/tools/docs/gen-renames.py new file mode 100755 index 000000000000..8cb3b2157d83 --- /dev/null +++ b/tools/docs/gen-renames.py @@ -0,0 +1,130 @@ +#! /usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 +# +# Copyright © 2025, Oracle and/or its affiliates. +# Author: Vegard Nossum <vegard.nossum@oracle.com> + +"""Trawl repository history for renames of Documentation/**.rst files. 
+
+Example:
+
+    tools/docs/gen-renames.py --rev HEAD > Documentation/.renames.txt
+"""
+
+import argparse
+import itertools
+import os
+import subprocess
+import sys
+
+parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
+parser.add_argument('--rev', default='HEAD', help='generate renames up to this revision')
+
+args = parser.parse_args()
+
+def normalize(path):
+    prefix = 'Documentation/'
+    suffix = '.rst'
+
+    assert path.startswith(prefix)
+    assert path.endswith(suffix)
+
+    return path[len(prefix):-len(suffix)]
+
+class Name(object):
+    def __init__(self, name):
+        self.names = [name]
+
+    def rename(self, new_name):
+        self.names.append(new_name)
+
+names = {}
+
+for line in subprocess.check_output([
+    'git', 'log',
+    '--reverse',
+    '--oneline',
+    '--find-renames',
+    '--diff-filter=RD',
+    '--name-status',
+    '--format=commit %H',
+    # ~v4.8-ish is when Sphinx/.rst was added in the first place
+    f'v4.8..{args.rev}',
+    '--',
+    'Documentation/'
+], text=True).splitlines():
+    # rename
+    if line.startswith('R'):
+        _, old, new = line[1:].split('\t', 2)
+
+        if old.endswith('.rst') and new.endswith('.rst'):
+            old = normalize(old)
+            new = normalize(new)
+
+            name = names.get(old)
+            if name is None:
+                name = Name(old)
+            else:
+                del names[old]
+
+            name.rename(new)
+            names[new] = name
+
+        continue
+
+    # delete
+    if line.startswith('D'):
+        _, old = line.split('\t', 1)
+
+        if old.endswith('.rst'):
+            old = normalize(old)
+
+            # TODO: we could save added/modified files as well and propose
+            # them as alternatives
+            name = names.get(old)
+            if name is not None:
+                del names[old]
+
+        continue
+
+#
+# Get the set of current files so we can sanity check that we aren't
+# redirecting any of those
+#
+
+current_files = set()
+for line in subprocess.check_output([
+    'git', 'ls-tree',
+    '-r',
+    '--name-only',
+    args.rev,
+    'Documentation/',
+], text=True).splitlines():
+    if line.endswith('.rst'):
+        current_files.add(normalize(line))
+
+#
+# Format/group/output result
+#
+
+result = []
+for _, v in names.items():
+    old_names = v.names[:-1]
+    new_name = v.names[-1]
+
+    for old_name in old_names:
+        if old_name == new_name:
+            # A rename chain came back to this name; don't redirect that
+            continue
+
+        if old_name in current_files:
+            # A file was recreated with a former name; don't redirect those
+            continue
+
+        result.append((old_name, new_name))
+
+for old_name, new_name in sorted(result):
+    print(f"{old_name} {new_name}")
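
[editor's note: the Name class above collapses chains of renames so that every historical name maps to the final one. A minimal sketch, with hypothetical document names:]

    class Name:
        def __init__(self, name):
            self.names = [name]

        def rename(self, new_name):
            self.names.append(new_name)

    n = Name("old/doc")
    n.rename("interim/doc")
    n.rename("final/doc")
    # every old name maps to the last entry in the chain
    print([(old, n.names[-1]) for old in n.names[:-1]])
    # -> [('old/doc', 'final/doc'), ('interim/doc', 'final/doc')]
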
diff --git a/tools/docs/lib/__init__.py b/tools/docs/lib/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
--- /dev/null
+++ b/tools/docs/lib/__init__.py
diff --git a/tools/docs/lib/enrich_formatter.py b/tools/docs/lib/enrich_formatter.py
new file mode 100644
index 000000000000..bb171567a4ca
--- /dev/null
+++ b/tools/docs/lib/enrich_formatter.py
@@ -0,0 +1,70 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2025 by Mauro Carvalho Chehab <mchehab@kernel.org>.
+
+"""
+Ancillary argparse HelpFormatter class that works in a similar way to
+argparse.RawDescriptionHelpFormatter, i.e. the description maintains line
+breaks, but it also implements transformations of the help text. The
+actual transformations are done by enrich_text(), if the output is a TTY.
+
+Currently, the following transformations are done:
+
+    - Positional arguments are shown in upper case;
+    - if the output is a TTY, ``var`` and positional arguments are shown
+      prepended by an ANSI SGR code. This is usually rendered as bold. On
+      some terminals, like konsole, this is rendered as colored bold text.
+"""
+
+import argparse
+import re
+import sys
+
+class EnrichFormatter(argparse.HelpFormatter):
+    """
+    Better format the output, making it easier to identify the positional
+    args and how they're used in the __doc__ description.
+    """
+    def __init__(self, *args, **kwargs):
+        """Initialize the class and check if the output is a TTY"""
+        super().__init__(*args, **kwargs)
+        self._tty = sys.stdout.isatty()
+
+    def enrich_text(self, text):
+        """Handle ReST markups (currently, only ``foo``)"""
+        if self._tty and text:
+            # Replace ``text`` with ANSI SGR (bold)
+            return re.sub(r'\`\`(.+?)\`\`',
+                          lambda m: f'\033[1m{m.group(1)}\033[0m', text)
+        return text
+
+    def _fill_text(self, text, width, indent):
+        """Enrich descriptions with markups on them"""
+        enriched = self.enrich_text(text)
+        return "\n".join(indent + line for line in enriched.splitlines())
+
+    def _format_usage(self, usage, actions, groups, prefix):
+        """Enrich positional arguments at the usage: line"""
+
+        prog = self._prog
+        parts = []
+
+        for action in actions:
+            if action.option_strings:
+                opt = action.option_strings[0]
+                if action.nargs != 0:
+                    opt += f" {action.dest.upper()}"
+                parts.append(f"[{opt}]")
+            else:
+                # Positional argument
+                parts.append(self.enrich_text(f"``{action.dest.upper()}``"))
+
+        usage_text = f"{prefix or 'usage: '}{prog} {' '.join(parts)}\n"
+        return usage_text
+
+    def _format_action_invocation(self, action):
+        """Enrich argument names"""
+        if not action.option_strings:
+            return self.enrich_text(f"``{action.dest.upper()}``")
+
+        return ", ".join(action.option_strings)
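
[editor's note: EnrichFormatter drops in wherever argparse expects a formatter class, which is how tools/docs/parse-headers.py uses it later in this series. A minimal sketch; it assumes it is run from tools/docs/ so that lib/ is importable, and the description text is made up:]

    import argparse
    from lib.enrich_formatter import EnrichFormatter

    parser = argparse.ArgumentParser(description="Handles a ``FILE``",
                                     formatter_class=EnrichFormatter)
    parser.add_argument("file", help="Input file")
    # On a TTY, FILE is rendered in bold in both usage and description
    parser.print_help()
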
diff --git a/tools/docs/lib/parse_data_structs.py b/tools/docs/lib/parse_data_structs.py
new file mode 100755
index 000000000000..a5aa2e182052
--- /dev/null
+++ b/tools/docs/lib/parse_data_structs.py
@@ -0,0 +1,452 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2016-2025 by Mauro Carvalho Chehab <mchehab@kernel.org>.
+# pylint: disable=R0912,R0915
+
+"""
+Parse a source file or header, creating ReStructured Text cross-references.
+
+It accepts an optional file to change the default symbol reference or to
+suppress symbols from the output.
+
+It is capable of identifying defines, functions, structs, typedefs,
+enums and enum symbols and of creating cross-references for all of them.
+It is also capable of distinguishing a #define used for specifying a
+Linux ioctl.
+
+The optional rules file contains a set of rules like:
+
+    ignore ioctl VIDIOC_ENUM_FMT
+    replace ioctl VIDIOC_DQBUF vidioc_qbuf
+    replace define V4L2_EVENT_MD_FL_HAVE_FRAME_SEQ :c:type:`v4l2_event_motion_det`
+"""
+
+import os
+import re
+import sys
+
+
+class ParseDataStructs:
+    """
+    Creates an enriched version of a kernel header file with cross-links
+    to each C data structure type.
+
+    It is meant to allow having a more comprehensive documentation, where
+    uAPI headers will create cross-reference links to the code.
+
+    It is capable of identifying defines, functions, structs, typedefs,
+    enums and enum symbols and of creating cross-references for all of
+    them. It is also capable of distinguishing a #define used for
+    specifying a Linux ioctl.
+
+    By default, it creates rules for all symbols and defines, but it also
+    allows parsing an exceptions file. Such a file contains a set of rules
+    using the syntax below:
+
+    1. Ignore rules:
+
+           ignore <type> <symbol>
+
+       Removes the symbol from reference generation.
+
+    2. Replace rules:
+
+           replace <type> <old_symbol> <new_reference>
+
+       Replaces the reference for old_symbol with a new one. The
+       new_reference can be:
+           - A simple symbol name;
+           - A full Sphinx reference.
+
+    In both cases, <type> can be:
+        - ioctl: for defines that end with _IO*, e.g. ioctl definitions
+        - define: for other defines
+        - symbol: for symbols defined within enums;
+        - typedef: for typedefs;
+        - enum: for the name of a non-anonymous enum;
+        - struct: for structs.
+
+    Examples:
+
+        ignore define __LINUX_MEDIA_H
+        ignore ioctl VIDIOC_ENUM_FMT
+        replace ioctl VIDIOC_DQBUF vidioc_qbuf
+        replace define V4L2_EVENT_MD_FL_HAVE_FRAME_SEQ :c:type:`v4l2_event_motion_det`
+    """
+
+    # Parser regexes with multiple ways to capture enums and structs
+    RE_ENUMS = [
+        re.compile(r"^\s*enum\s+([\w_]+)\s*\{"),
+        re.compile(r"^\s*enum\s+([\w_]+)\s*$"),
+        re.compile(r"^\s*typedef\s*enum\s+([\w_]+)\s*\{"),
+        re.compile(r"^\s*typedef\s*enum\s+([\w_]+)\s*$"),
+    ]
+    RE_STRUCTS = [
+        re.compile(r"^\s*struct\s+([_\w][\w\d_]+)\s*\{"),
+        re.compile(r"^\s*struct\s+([_\w][\w\d_]+)$"),
+        re.compile(r"^\s*typedef\s*struct\s+([_\w][\w\d_]+)\s*\{"),
+        re.compile(r"^\s*typedef\s*struct\s+([_\w][\w\d_]+)$"),
+    ]
+
+    # FIXME: the original code was written long before the Sphinx C
+    # domain gained support for multiple namespaces. To avoid too much
+    # churn in the existing hyperlinks, the code kept using "c:type"
+    # instead of the right types. To change that, the types need to be
+    # changed not only here, but also in the uAPI media documentation.
+    DEF_SYMBOL_TYPES = {
+        "ioctl": {
+            "prefix": "\\ ",
+            "suffix": "\\ ",
+            "ref_type": ":ref",
+            "description": "IOCTL Commands",
+        },
+        "define": {
+            "prefix": "\\ ",
+            "suffix": "\\ ",
+            "ref_type": ":ref",
+            "description": "Macros and Definitions",
+        },
+        # We're calling each definition inside an enum a "symbol"
+        "symbol": {
+            "prefix": "\\ ",
+            "suffix": "\\ ",
+            "ref_type": ":ref",
+            "description": "Enumeration values",
+        },
+        "typedef": {
+            "prefix": "\\ ",
+            "suffix": "\\ ",
+            "ref_type": ":c:type",
+            "description": "Type Definitions",
+        },
+        # This is the description of the enum itself
+        "enum": {
+            "prefix": "\\ ",
+            "suffix": "\\ ",
+            "ref_type": ":c:type",
+            "description": "Enumerations",
+        },
+        "struct": {
+            "prefix": "\\ ",
+            "suffix": "\\ ",
+            "ref_type": ":c:type",
+            "description": "Structures",
+        },
+    }
+
+    def __init__(self, debug: bool = False):
+        """Initialize internal vars"""
+        self.debug = debug
+        self.data = ""
+
+        self.symbols = {}
+
+        for symbol_type in self.DEF_SYMBOL_TYPES:
+            self.symbols[symbol_type] = {}
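
[editor's note: conceptually, the symbol table filled in by store_type() (below) ends up holding decorated cross-reference strings. A sketch of two entries, using hypothetical uAPI symbols, derived from the prefix/ref_type fields above:]

    # What self.symbols might look like after parsing a header
    symbols = {
        "struct": {"v4l2_format": "\\ :c:type:`v4l2_format <v4l2_format>`\\ "},
        "ioctl": {"VIDIOC_G_FMT": "\\ :ref:`VIDIOC_G_FMT <vidioc_g_fmt>`\\ "},
    }
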
+    def store_type(self, symbol_type: str, symbol: str,
+                   ref_name: str = None, replace_underscores: bool = True):
+        """
+        Stores a new symbol at self.symbols under symbol_type.
+
+        By default, underscores are replaced by "-"
+        """
+        defs = self.DEF_SYMBOL_TYPES[symbol_type]
+
+        prefix = defs.get("prefix", "")
+        suffix = defs.get("suffix", "")
+        ref_type = defs.get("ref_type")
+
+        # Determine ref_link based on the symbol type
+        if ref_type:
+            if symbol_type == "enum":
+                ref_link = f"{ref_type}:`{symbol}`"
+            else:
+                if not ref_name:
+                    ref_name = symbol.lower()
+
+                # :ref targets use dashes instead of underscores
+                if ref_type == ":ref" and replace_underscores:
+                    ref_name = ref_name.replace("_", "-")
+
+                ref_link = f"{ref_type}:`{symbol} <{ref_name}>`"
+        else:
+            ref_link = symbol
+
+        self.symbols[symbol_type][symbol] = f"{prefix}{ref_link}{suffix}"
+
+    def store_line(self, line):
+        """Stores a line at self.data, properly indented"""
+        line = " " + line.expandtabs()
+        self.data += line.rstrip(" ")
+
+    def parse_file(self, file_in: str):
+        """Reads a C source file and gets identifiers"""
+        self.data = ""
+        is_enum = False
+        is_comment = False
+        multiline = ""
+
+        with open(file_in, "r",
+                  encoding="utf-8", errors="backslashreplace") as f:
+            for line_no, line in enumerate(f):
+                self.store_line(line)
+                line = line.strip("\n")
+
+                # Handle continuation lines
+                if line.endswith("\\"):
+                    multiline += line[:-1]
+                    continue
+
+                if multiline:
+                    line = multiline + line
+                    multiline = ""
+
+                # Handle comments. They can be multi-line
+                if not is_comment:
+                    if re.search(r"/\*.*", line):
+                        is_comment = True
+                    else:
+                        # Strip C99-style comments
+                        line = re.sub(r"(//.*)", "", line)
+
+                if is_comment:
+                    if re.search(r".*\*/", line):
+                        is_comment = False
+                    else:
+                        multiline = line
+                        continue
+
+                # At this point, the line variable may hold a multi-line
+                # statement, if lines end with \ or have multi-line comments.
+                # With that, entire comments can safely be removed, and
+                # there's no need to use re.DOTALL for the logic below.
+
+                line = re.sub(r"(/\*.*\*/)", "", line)
+                if not line.strip():
+                    continue
+
+                # It can be useful for debug purposes to print the file after
+                # comments were stripped and multi-lines grouped.
+                if self.debug > 1:
+                    print(f"line {line_no + 1}: {line}")
+
+                # Now the fun begins: parse each type and store it.
+
+                # We opted for a two-pass parsing logic here because:
+                # 1. it makes it easier to debug issues with non-parsed
+                #    symbols;
+                # 2. we want symbol replacement over the entire content,
+                #    not just where the symbol is detected.
+
+                if is_enum:
+                    match = re.match(r"^\s*([_\w][\w\d_]+)\s*[\,=]?", line)
+                    if match:
+                        self.store_type("symbol", match.group(1))
+                    if "}" in line:
+                        is_enum = False
+                    continue
+
+                match = re.match(r"^\s*#\s*define\s+([\w_]+)\s+_IO", line)
+                if match:
+                    self.store_type("ioctl", match.group(1),
+                                    replace_underscores=False)
+                    continue
+
+                match = re.match(r"^\s*#\s*define\s+([\w_]+)(\s+|$)", line)
+                if match:
+                    self.store_type("define", match.group(1))
+                    continue
+
+                match = re.match(r"^\s*typedef\s+([_\w][\w\d_]+)\s+(.*)\s+([_\w][\w\d_]+);",
+                                 line)
+                if match:
+                    name = match.group(2).strip()
+                    symbol = match.group(3)
+                    self.store_type("typedef", symbol, ref_name=name)
+                    continue
+
+                for re_enum in self.RE_ENUMS:
+                    match = re_enum.match(line)
+                    if match:
+                        self.store_type("enum", match.group(1))
+                        is_enum = True
+                        break
+
+                for re_struct in self.RE_STRUCTS:
+                    match = re_struct.match(line)
+                    if match:
+                        self.store_type("struct", match.group(1))
+                        break
+
+    def process_exceptions(self, fname: str):
+        """
+        Process an exceptions file with rules to ignore or replace references.
+ """ + if not fname: + return + + name = os.path.basename(fname) + + with open(fname, "r", encoding="utf-8", errors="backslashreplace") as f: + for ln, line in enumerate(f): + ln += 1 + line = line.strip() + if not line or line.startswith("#"): + continue + + # Handle ignore rules + match = re.match(r"^ignore\s+(\w+)\s+(\S+)", line) + if match: + c_type = match.group(1) + symbol = match.group(2) + + if c_type not in self.DEF_SYMBOL_TYPES: + sys.exit(f"{name}:{ln}: {c_type} is invalid") + + d = self.symbols[c_type] + if symbol in d: + del d[symbol] + + continue + + # Handle replace rules + match = re.match(r"^replace\s+(\S+)\s+(\S+)\s+(\S+)", line) + if not match: + sys.exit(f"{name}:{ln}: invalid line: {line}") + + c_type, old, new = match.groups() + + if c_type not in self.DEF_SYMBOL_TYPES: + sys.exit(f"{name}:{ln}: {c_type} is invalid") + + reftype = None + + # Parse reference type when the type is specified + + match = re.match(r"^\:c\:(data|func|macro|type)\:\`(.+)\`", new) + if match: + reftype = f":c:{match.group(1)}" + new = match.group(2) + else: + match = re.search(r"(\:ref)\:\`(.+)\`", new) + if match: + reftype = match.group(1) + new = match.group(2) + + # If the replacement rule doesn't have a type, get default + if not reftype: + reftype = self.DEF_SYMBOL_TYPES[c_type].get("ref_type") + if not reftype: + reftype = self.DEF_SYMBOL_TYPES[c_type].get("real_type") + + new_ref = f"{reftype}:`{old} <{new}>`" + + # Change self.symbols to use the replacement rule + if old in self.symbols[c_type]: + self.symbols[c_type][old] = new_ref + else: + print(f"{name}:{ln}: Warning: can't find {old} {c_type}") + + def debug_print(self): + """ + Print debug information containing the replacement rules per symbol. + To make easier to check, group them per type. + """ + if not self.debug: + return + + for c_type, refs in self.symbols.items(): + if not refs: # Skip empty dictionaries + continue + + print(f"{c_type}:") + + for symbol, ref in sorted(refs.items()): + print(f" {symbol} -> {ref}") + + print() + + def gen_output(self): + """Write the formatted output to a file.""" + + # Avoid extra blank lines + text = re.sub(r"\s+$", "", self.data) + "\n" + text = re.sub(r"\n\s+\n", "\n\n", text) + + # Escape Sphinx special characters + text = re.sub(r"([\_\`\*\<\>\&\\\\:\/\|\%\$\#\{\}\~\^])", r"\\\1", text) + + # Source uAPI files may have special notes. Use bold font for them + text = re.sub(r"DEPRECATED", "**DEPRECATED**", text) + + # Delimiters to catch the entire symbol after escaped + start_delim = r"([ \n\t\(=\*\@])" + end_delim = r"(\s|,|\\=|\\:|\;|\)|\}|\{)" + + # Process all reference types + for ref_dict in self.symbols.values(): + for symbol, replacement in ref_dict.items(): + symbol = re.escape(re.sub(r"([\_\`\*\<\>\&\\\\:\/])", r"\\\1", symbol)) + text = re.sub(fr'{start_delim}{symbol}{end_delim}', + fr'\1{replacement}\2', text) + + # Remove "\ " where not needed: before spaces and at the end of lines + text = re.sub(r"\\ ([\n ])", r"\1", text) + text = re.sub(r" \\ ", " ", text) + + return text + + def gen_toc(self): + """ + Create a TOC table pointing to each symbol from the header + """ + text = [] + + # Add header + text.append(".. 
contents:: Table of Contents") + text.append(" :depth: 2") + text.append(" :local:") + text.append("") + + # Sort symbol types per description + symbol_descriptions = [] + for k, v in self.DEF_SYMBOL_TYPES.items(): + symbol_descriptions.append((v['description'], k)) + + symbol_descriptions.sort() + + # Process each category + for description, c_type in symbol_descriptions: + + refs = self.symbols[c_type] + if not refs: # Skip empty categories + continue + + text.append(f"{description}") + text.append("-" * len(description)) + text.append("") + + # Sort symbols alphabetically + for symbol, ref in sorted(refs.items()): + text.append(f"* :{ref}:") + + text.append("") # Add empty line between categories + + return "\n".join(text) + + def write_output(self, file_in: str, file_out: str, toc: bool): + title = os.path.basename(file_in) + + if toc: + text = self.gen_toc() + else: + text = self.gen_output() + + with open(file_out, "w", encoding="utf-8", errors="backslashreplace") as f: + f.write(".. -*- coding: utf-8; mode: rst -*-\n\n") + f.write(f"{title}\n") + f.write("=" * len(title) + "\n\n") + + if not toc: + f.write(".. parsed-literal::\n\n") + + f.write(text) diff --git a/tools/docs/parse-headers.py b/tools/docs/parse-headers.py new file mode 100755 index 000000000000..bfa4e46a53e3 --- /dev/null +++ b/tools/docs/parse-headers.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 +# Copyright (c) 2016, 2025 by Mauro Carvalho Chehab <mchehab@kernel.org>. +# pylint: disable=C0103 + +""" +Convert a C header or source file ``FILE_IN``, into a ReStructured Text +included via ..parsed-literal block with cross-references for the +documentation files that describe the API. It accepts an optional +``FILE_RULES`` file to describes what elements will be either ignored or +be pointed to a non-default reference type/name. + +The output is written at ``FILE_OUT``. + +It is capable of identifying defines, functions, structs, typedefs, +enums and enum symbols and create cross-references for all of them. +It is also capable of distinguish #define used for specifying a Linux +ioctl. + +The optional ``FILE_RULES`` contains a set of rules like: + + ignore ioctl VIDIOC_ENUM_FMT + replace ioctl VIDIOC_DQBUF vidioc_qbuf + replace define V4L2_EVENT_MD_FL_HAVE_FRAME_SEQ :c:type:`v4l2_event_motion_det` +""" + +import argparse + +from lib.parse_data_structs import ParseDataStructs +from lib.enrich_formatter import EnrichFormatter + +def main(): + """Main function""" + parser = argparse.ArgumentParser(description=__doc__, + formatter_class=EnrichFormatter) + + parser.add_argument("-d", "--debug", action="count", default=0, + help="Increase debug level. 
Can be used multiple times") + parser.add_argument("-t", "--toc", action="store_true", + help="instead of a literal block, outputs a TOC table at the RST file") + + parser.add_argument("file_in", help="Input C file") + parser.add_argument("file_out", help="Output RST file") + parser.add_argument("file_rules", nargs="?", + help="Exceptions file (optional)") + + args = parser.parse_args() + + parser = ParseDataStructs(debug=args.debug) + parser.parse_file(args.file_in) + + if args.file_rules: + parser.process_exceptions(args.file_rules) + + parser.debug_print() + parser.write_output(args.file_in, args.file_out, args.toc) + + +if __name__ == "__main__": + main() diff --git a/tools/lib/bpf/Build b/tools/lib/bpf/Build index e2cd558ca0b4..c80204bb72a2 100644 --- a/tools/lib/bpf/Build +++ b/tools/lib/bpf/Build @@ -1,4 +1,4 @@ -libbpf-y := libbpf.o bpf.o nlattr.o btf.o libbpf_errno.o str_error.o \ +libbpf-y := libbpf.o bpf.o nlattr.o btf.o libbpf_utils.o \ netlink.o bpf_prog_linfo.o libbpf_probes.o hashmap.o \ btf_dump.o ringbuf.o strset.o linker.o gen_loader.o relo_core.o \ usdt.o zip.o elf.o features.o btf_iter.o btf_relocate.o diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index 37682908cb0f..18907f0fcf9f 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -23,7 +23,6 @@ #include "libbpf_internal.h" #include "hashmap.h" #include "strset.h" -#include "str_error.h" #define BTF_MAX_NR_TYPES 0x7fffffffU #define BTF_MAX_STR_OFFSET 0x7fffffffU diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c index f09f25eccf3c..6388392f49a0 100644 --- a/tools/lib/bpf/btf_dump.c +++ b/tools/lib/bpf/btf_dump.c @@ -21,7 +21,6 @@ #include "hashmap.h" #include "libbpf.h" #include "libbpf_internal.h" -#include "str_error.h" static const char PREFIXES[] = "\t\t\t\t\t\t\t\t\t\t\t\t\t"; static const size_t PREFIX_CNT = sizeof(PREFIXES) - 1; diff --git a/tools/lib/bpf/elf.c b/tools/lib/bpf/elf.c index 823f83ad819c..295dbda24580 100644 --- a/tools/lib/bpf/elf.c +++ b/tools/lib/bpf/elf.c @@ -9,7 +9,6 @@ #include <linux/kernel.h> #include "libbpf_internal.h" -#include "str_error.h" /* A SHT_GNU_versym section holds 16-bit words. 
This bit is set if * the symbol is hidden and can only be seen when referenced using an diff --git a/tools/lib/bpf/features.c b/tools/lib/bpf/features.c index 760657f5224c..b842b83e2480 100644 --- a/tools/lib/bpf/features.c +++ b/tools/lib/bpf/features.c @@ -6,7 +6,6 @@ #include "libbpf.h" #include "libbpf_common.h" #include "libbpf_internal.h" -#include "str_error.h" static inline __u64 ptr_to_u64(const void *ptr) { diff --git a/tools/lib/bpf/gen_loader.c b/tools/lib/bpf/gen_loader.c index 6945dd99a846..cd5c2543f54d 100644 --- a/tools/lib/bpf/gen_loader.c +++ b/tools/lib/bpf/gen_loader.c @@ -4,6 +4,7 @@ #include <stdlib.h> #include <string.h> #include <errno.h> +#include <asm/byteorder.h> #include <linux/filter.h> #include <sys/param.h> #include "btf.h" @@ -13,8 +14,6 @@ #include "hashmap.h" #include "bpf_gen_internal.h" #include "skel_internal.h" -#include <asm/byteorder.h> -#include "str_error.h" #define MAX_USED_MAPS 64 #define MAX_USED_PROGS 32 diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index f92083f51bdb..dd3b2f57082d 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -35,7 +35,6 @@ #include <linux/perf_event.h> #include <linux/bpf_perf_event.h> #include <linux/ring_buffer.h> -#include <linux/unaligned.h> #include <sys/epoll.h> #include <sys/ioctl.h> #include <sys/mman.h> @@ -51,7 +50,6 @@ #include "libbpf.h" #include "bpf.h" #include "btf.h" -#include "str_error.h" #include "libbpf_internal.h" #include "hashmap.h" #include "bpf_gen_internal.h" @@ -319,8 +317,6 @@ static void pr_perm_msg(int err) buf); } -#define STRERR_BUFSIZE 128 - /* Copied from tools/perf/util/util.h */ #ifndef zfree # define zfree(ptr) ({ free(*ptr); *ptr = NULL; }) @@ -14285,100 +14281,3 @@ void bpf_object__destroy_skeleton(struct bpf_object_skeleton *s) free(s->progs); free(s); } - -static inline __u32 ror32(__u32 v, int bits) -{ - return (v >> bits) | (v << (32 - bits)); -} - -#define SHA256_BLOCK_LENGTH 64 -#define Ch(x, y, z) (((x) & (y)) ^ (~(x) & (z))) -#define Maj(x, y, z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z))) -#define Sigma_0(x) (ror32((x), 2) ^ ror32((x), 13) ^ ror32((x), 22)) -#define Sigma_1(x) (ror32((x), 6) ^ ror32((x), 11) ^ ror32((x), 25)) -#define sigma_0(x) (ror32((x), 7) ^ ror32((x), 18) ^ ((x) >> 3)) -#define sigma_1(x) (ror32((x), 17) ^ ror32((x), 19) ^ ((x) >> 10)) - -static const __u32 sha256_K[64] = { - 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, - 0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, - 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 0xe49b69c1, 0xefbe4786, - 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, - 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, - 0x06ca6351, 0x14292967, 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, - 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 0xa2bfe8a1, 0xa81a664b, - 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, - 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, - 0x5b9cca4f, 0x682e6ff3, 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, - 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, -}; - -#define SHA256_ROUND(i, a, b, c, d, e, f, g, h) \ - { \ - __u32 tmp = h + Sigma_1(e) + Ch(e, f, g) + sha256_K[i] + w[i]; \ - d += tmp; \ - h = tmp + Sigma_0(a) + Maj(a, b, c); \ - } - -static void sha256_blocks(__u32 state[8], const __u8 *data, size_t nblocks) -{ - while (nblocks--) { - __u32 a = state[0]; - __u32 b = state[1]; - __u32 c = state[2]; - __u32 d = state[3]; 
- __u32 e = state[4]; - __u32 f = state[5]; - __u32 g = state[6]; - __u32 h = state[7]; - __u32 w[64]; - int i; - - for (i = 0; i < 16; i++) - w[i] = get_unaligned_be32(&data[4 * i]); - for (; i < ARRAY_SIZE(w); i++) - w[i] = sigma_1(w[i - 2]) + w[i - 7] + - sigma_0(w[i - 15]) + w[i - 16]; - for (i = 0; i < ARRAY_SIZE(w); i += 8) { - SHA256_ROUND(i + 0, a, b, c, d, e, f, g, h); - SHA256_ROUND(i + 1, h, a, b, c, d, e, f, g); - SHA256_ROUND(i + 2, g, h, a, b, c, d, e, f); - SHA256_ROUND(i + 3, f, g, h, a, b, c, d, e); - SHA256_ROUND(i + 4, e, f, g, h, a, b, c, d); - SHA256_ROUND(i + 5, d, e, f, g, h, a, b, c); - SHA256_ROUND(i + 6, c, d, e, f, g, h, a, b); - SHA256_ROUND(i + 7, b, c, d, e, f, g, h, a); - } - state[0] += a; - state[1] += b; - state[2] += c; - state[3] += d; - state[4] += e; - state[5] += f; - state[6] += g; - state[7] += h; - data += SHA256_BLOCK_LENGTH; - } -} - -void libbpf_sha256(const void *data, size_t len, __u8 out[SHA256_DIGEST_LENGTH]) -{ - __u32 state[8] = { 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, - 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 }; - const __be64 bitcount = cpu_to_be64((__u64)len * 8); - __u8 final_data[2 * SHA256_BLOCK_LENGTH] = { 0 }; - size_t final_len = len % SHA256_BLOCK_LENGTH; - int i; - - sha256_blocks(state, data, len / SHA256_BLOCK_LENGTH); - - memcpy(final_data, data + len - final_len, final_len); - final_data[final_len] = 0x80; - final_len = round_up(final_len + 9, SHA256_BLOCK_LENGTH); - memcpy(&final_data[final_len - 8], &bitcount, 8); - - sha256_blocks(state, final_data, final_len / SHA256_BLOCK_LENGTH); - - for (i = 0; i < ARRAY_SIZE(state); i++) - put_unaligned_be32(state[i], &out[4 * i]); -} diff --git a/tools/lib/bpf/libbpf_errno.c b/tools/lib/bpf/libbpf_errno.c deleted file mode 100644 index 6b180172ec6b..000000000000 --- a/tools/lib/bpf/libbpf_errno.c +++ /dev/null @@ -1,75 +0,0 @@ -// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) - -/* - * Copyright (C) 2013-2015 Alexei Starovoitov <ast@kernel.org> - * Copyright (C) 2015 Wang Nan <wangnan0@huawei.com> - * Copyright (C) 2015 Huawei Inc. - * Copyright (C) 2017 Nicira, Inc. - */ - -#undef _GNU_SOURCE -#include <stdio.h> -#include <string.h> - -#include "libbpf.h" -#include "libbpf_internal.h" - -/* make sure libbpf doesn't use kernel-only integer typedefs */ -#pragma GCC poison u8 u16 u32 u64 s8 s16 s32 s64 - -#define ERRNO_OFFSET(e) ((e) - __LIBBPF_ERRNO__START) -#define ERRCODE_OFFSET(c) ERRNO_OFFSET(LIBBPF_ERRNO__##c) -#define NR_ERRNO (__LIBBPF_ERRNO__END - __LIBBPF_ERRNO__START) - -static const char *libbpf_strerror_table[NR_ERRNO] = { - [ERRCODE_OFFSET(LIBELF)] = "Something wrong in libelf", - [ERRCODE_OFFSET(FORMAT)] = "BPF object format invalid", - [ERRCODE_OFFSET(KVERSION)] = "'version' section incorrect or lost", - [ERRCODE_OFFSET(ENDIAN)] = "Endian mismatch", - [ERRCODE_OFFSET(INTERNAL)] = "Internal error in libbpf", - [ERRCODE_OFFSET(RELOC)] = "Relocation failed", - [ERRCODE_OFFSET(VERIFY)] = "Kernel verifier blocks program loading", - [ERRCODE_OFFSET(PROG2BIG)] = "Program too big", - [ERRCODE_OFFSET(KVER)] = "Incorrect kernel version", - [ERRCODE_OFFSET(PROGTYPE)] = "Kernel doesn't support this program type", - [ERRCODE_OFFSET(WRNGPID)] = "Wrong pid in netlink message", - [ERRCODE_OFFSET(INVSEQ)] = "Invalid netlink sequence", - [ERRCODE_OFFSET(NLPARSE)] = "Incorrect netlink message parsing", -}; - -int libbpf_strerror(int err, char *buf, size_t size) -{ - int ret; - - if (!buf || !size) - return libbpf_err(-EINVAL); - - err = err > 0 ? 
err : -err; - - if (err < __LIBBPF_ERRNO__START) { - ret = strerror_r(err, buf, size); - buf[size - 1] = '\0'; - return libbpf_err_errno(ret); - } - - if (err < __LIBBPF_ERRNO__END) { - const char *msg; - - msg = libbpf_strerror_table[ERRNO_OFFSET(err)]; - ret = snprintf(buf, size, "%s", msg); - buf[size - 1] = '\0'; - /* The length of the buf and msg is positive. - * A negative number may be returned only when the - * size exceeds INT_MAX. Not likely to appear. - */ - if (ret >= size) - return libbpf_err(-ERANGE); - return 0; - } - - ret = snprintf(buf, size, "Unknown libbpf error %d", err); - buf[size - 1] = '\0'; - if (ret >= size) - return libbpf_err(-ERANGE); - return libbpf_err(-ENOENT); -} diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h index c93797dcaf5b..35b2527bedec 100644 --- a/tools/lib/bpf/libbpf_internal.h +++ b/tools/lib/bpf/libbpf_internal.h @@ -172,6 +172,16 @@ do { \ #define pr_info(fmt, ...) __pr(LIBBPF_INFO, fmt, ##__VA_ARGS__) #define pr_debug(fmt, ...) __pr(LIBBPF_DEBUG, fmt, ##__VA_ARGS__) +/** + * @brief **libbpf_errstr()** returns string corresponding to numeric errno + * @param err negative numeric errno + * @return pointer to string representation of the errno, that is invalidated + * upon the next call. + */ +const char *libbpf_errstr(int err); + +#define errstr(err) libbpf_errstr(err) + #ifndef __has_builtin #define __has_builtin(x) 0 #endif @@ -712,6 +722,11 @@ static inline bool is_pow_of_2(size_t x) return x && (x & (x - 1)) == 0; } +static inline __u32 ror32(__u32 v, int bits) +{ + return (v >> bits) | (v << (32 - bits)); +} + #define PROG_LOAD_ATTEMPTS 5 int sys_bpf_prog_load(union bpf_attr *attr, unsigned int size, int attempts); diff --git a/tools/lib/bpf/libbpf_utils.c b/tools/lib/bpf/libbpf_utils.c new file mode 100644 index 000000000000..5d66bc6ff098 --- /dev/null +++ b/tools/lib/bpf/libbpf_utils.c @@ -0,0 +1,252 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) + +/* + * Copyright (C) 2013-2015 Alexei Starovoitov <ast@kernel.org> + * Copyright (C) 2015 Wang Nan <wangnan0@huawei.com> + * Copyright (C) 2015 Huawei Inc. + * Copyright (C) 2017 Nicira, Inc. 
diff --git a/tools/lib/bpf/libbpf_utils.c b/tools/lib/bpf/libbpf_utils.c
new file mode 100644
index 000000000000..5d66bc6ff098
--- /dev/null
+++ b/tools/lib/bpf/libbpf_utils.c
@@ -0,0 +1,252 @@
+// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+
+/*
+ * Copyright (C) 2013-2015 Alexei Starovoitov <ast@kernel.org>
+ * Copyright (C) 2015 Wang Nan <wangnan0@huawei.com>
+ * Copyright (C) 2015 Huawei Inc.
+ * Copyright (C) 2017 Nicira, Inc.
+ */
+
+#undef _GNU_SOURCE
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <inttypes.h>
+#include <linux/kernel.h>
+
+#include "libbpf.h"
+#include "libbpf_internal.h"
+
+#ifndef ENOTSUPP
+#define ENOTSUPP 524
+#endif
+
+/* make sure libbpf doesn't use kernel-only integer typedefs */
+#pragma GCC poison u8 u16 u32 u64 s8 s16 s32 s64
+
+#define ERRNO_OFFSET(e)		((e) - __LIBBPF_ERRNO__START)
+#define ERRCODE_OFFSET(c)	ERRNO_OFFSET(LIBBPF_ERRNO__##c)
+#define NR_ERRNO	(__LIBBPF_ERRNO__END - __LIBBPF_ERRNO__START)
+
+static const char *libbpf_strerror_table[NR_ERRNO] = {
+	[ERRCODE_OFFSET(LIBELF)]	= "Something wrong in libelf",
+	[ERRCODE_OFFSET(FORMAT)]	= "BPF object format invalid",
+	[ERRCODE_OFFSET(KVERSION)]	= "'version' section incorrect or lost",
+	[ERRCODE_OFFSET(ENDIAN)]	= "Endian mismatch",
+	[ERRCODE_OFFSET(INTERNAL)]	= "Internal error in libbpf",
+	[ERRCODE_OFFSET(RELOC)]		= "Relocation failed",
+	[ERRCODE_OFFSET(VERIFY)]	= "Kernel verifier blocks program loading",
+	[ERRCODE_OFFSET(PROG2BIG)]	= "Program too big",
+	[ERRCODE_OFFSET(KVER)]		= "Incorrect kernel version",
+	[ERRCODE_OFFSET(PROGTYPE)]	= "Kernel doesn't support this program type",
+	[ERRCODE_OFFSET(WRNGPID)]	= "Wrong pid in netlink message",
+	[ERRCODE_OFFSET(INVSEQ)]	= "Invalid netlink sequence",
+	[ERRCODE_OFFSET(NLPARSE)]	= "Incorrect netlink message parsing",
+};
+
+int libbpf_strerror(int err, char *buf, size_t size)
+{
+	int ret;
+
+	if (!buf || !size)
+		return libbpf_err(-EINVAL);
+
+	err = err > 0 ? err : -err;
+
+	if (err < __LIBBPF_ERRNO__START) {
+		ret = strerror_r(err, buf, size);
+		buf[size - 1] = '\0';
+		return libbpf_err_errno(ret);
+	}
+
+	if (err < __LIBBPF_ERRNO__END) {
+		const char *msg;
+
+		msg = libbpf_strerror_table[ERRNO_OFFSET(err)];
+		ret = snprintf(buf, size, "%s", msg);
+		buf[size - 1] = '\0';
+		/* The length of the buf and msg is positive.
+		 * A negative number may be returned only when the
+		 * size exceeds INT_MAX. Not likely to appear.
+		 */
+		if (ret >= size)
+			return libbpf_err(-ERANGE);
+		return 0;
+	}
+
+	ret = snprintf(buf, size, "Unknown libbpf error %d", err);
+	buf[size - 1] = '\0';
+	if (ret >= size)
+		return libbpf_err(-ERANGE);
+	return libbpf_err(-ENOENT);
+}
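For contrast with errstr(), the public libbpf_strerror() entry point above writes into a caller-provided buffer and also covers libbpf's private LIBBPF_ERRNO__* range via the lookup table. A minimal sketch, assuming the usual definitions from libbpf.h:

	/* Illustrative only: decode both ordinary and libbpf-specific errors. */
	char msg[128];

	if (!libbpf_strerror(-LIBBPF_ERRNO__VERIFY, msg, sizeof(msg)))
		fprintf(stderr, "load failed: %s\n", msg);
	/* -> "Kernel verifier blocks program loading" */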
+
+const char *libbpf_errstr(int err)
+{
+	static __thread char buf[12];
+
+	if (err > 0)
+		err = -err;
+
+	switch (err) {
+	case -E2BIG: return "-E2BIG";
+	case -EACCES: return "-EACCES";
+	case -EADDRINUSE: return "-EADDRINUSE";
+	case -EADDRNOTAVAIL: return "-EADDRNOTAVAIL";
+	case -EAGAIN: return "-EAGAIN";
+	case -EALREADY: return "-EALREADY";
+	case -EBADF: return "-EBADF";
+	case -EBADFD: return "-EBADFD";
+	case -EBUSY: return "-EBUSY";
+	case -ECANCELED: return "-ECANCELED";
+	case -ECHILD: return "-ECHILD";
+	case -EDEADLK: return "-EDEADLK";
+	case -EDOM: return "-EDOM";
+	case -EEXIST: return "-EEXIST";
+	case -EFAULT: return "-EFAULT";
+	case -EFBIG: return "-EFBIG";
+	case -EILSEQ: return "-EILSEQ";
+	case -EINPROGRESS: return "-EINPROGRESS";
+	case -EINTR: return "-EINTR";
+	case -EINVAL: return "-EINVAL";
+	case -EIO: return "-EIO";
+	case -EISDIR: return "-EISDIR";
+	case -ELOOP: return "-ELOOP";
+	case -EMFILE: return "-EMFILE";
+	case -EMLINK: return "-EMLINK";
+	case -EMSGSIZE: return "-EMSGSIZE";
+	case -ENAMETOOLONG: return "-ENAMETOOLONG";
+	case -ENFILE: return "-ENFILE";
+	case -ENODATA: return "-ENODATA";
+	case -ENODEV: return "-ENODEV";
+	case -ENOENT: return "-ENOENT";
+	case -ENOEXEC: return "-ENOEXEC";
+	case -ENOLINK: return "-ENOLINK";
+	case -ENOMEM: return "-ENOMEM";
+	case -ENOSPC: return "-ENOSPC";
+	case -ENOTBLK: return "-ENOTBLK";
+	case -ENOTDIR: return "-ENOTDIR";
+	case -ENOTSUPP: return "-ENOTSUPP";
+	case -ENOTTY: return "-ENOTTY";
+	case -ENXIO: return "-ENXIO";
+	case -EOPNOTSUPP: return "-EOPNOTSUPP";
+	case -EOVERFLOW: return "-EOVERFLOW";
+	case -EPERM: return "-EPERM";
+	case -EPIPE: return "-EPIPE";
+	case -EPROTO: return "-EPROTO";
+	case -EPROTONOSUPPORT: return "-EPROTONOSUPPORT";
+	case -ERANGE: return "-ERANGE";
+	case -EROFS: return "-EROFS";
+	case -ESPIPE: return "-ESPIPE";
+	case -ESRCH: return "-ESRCH";
+	case -ETXTBSY: return "-ETXTBSY";
+	case -EUCLEAN: return "-EUCLEAN";
+	case -EXDEV: return "-EXDEV";
+	default:
+		snprintf(buf, sizeof(buf), "%d", err);
+		return buf;
+	}
+}
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wpacked"
+#pragma GCC diagnostic ignored "-Wattributes"
+struct __packed_u32 { __u32 __val; } __attribute__((packed));
+#pragma GCC diagnostic pop
+
+#define get_unaligned_be32(p) be32_to_cpu((((struct __packed_u32 *)(p))->__val))
+#define put_unaligned_be32(v, p) do {				\
+	((struct __packed_u32 *)(p))->__val = cpu_to_be32(v);	\
+} while (0)
+
+#define SHA256_BLOCK_LENGTH	64
+#define Ch(x, y, z)	(((x) & (y)) ^ (~(x) & (z)))
+#define Maj(x, y, z)	(((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
+#define Sigma_0(x)	(ror32((x), 2) ^ ror32((x), 13) ^ ror32((x), 22))
+#define Sigma_1(x)	(ror32((x), 6) ^ ror32((x), 11) ^ ror32((x), 25))
+#define sigma_0(x)	(ror32((x), 7) ^ ror32((x), 18) ^ ((x) >> 3))
+#define sigma_1(x)	(ror32((x), 17) ^ ror32((x), 19) ^ ((x) >> 10))
+
+static const __u32 sha256_K[64] = {
+	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1,
+	0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
+	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 0xe49b69c1, 0xefbe4786,
+	0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
+	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147,
+	0x06ca6351, 0x14292967, 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
+	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 0xa2bfe8a1, 0xa81a664b,
+	0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
+	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a,
+	0x5b9cca4f, 0x682e6ff3, 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
+	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
+};
+
+#define SHA256_ROUND(i, a, b, c, d, e, f, g, h)				\
+	{								\
+		__u32 tmp = h + Sigma_1(e) + Ch(e, f, g) + sha256_K[i] + w[i]; \
+		d += tmp;						\
+		h = tmp + Sigma_0(a) + Maj(a, b, c);			\
+	}
+
+static void sha256_blocks(__u32 state[8], const __u8 *data, size_t nblocks)
+{
+	while (nblocks--) {
+		__u32 a = state[0];
+		__u32 b = state[1];
+		__u32 c = state[2];
+		__u32 d = state[3];
+		__u32 e = state[4];
+		__u32 f = state[5];
+		__u32 g = state[6];
+		__u32 h = state[7];
+		__u32 w[64];
+		int i;
+
+		for (i = 0; i < 16; i++)
+			w[i] = get_unaligned_be32(&data[4 * i]);
+		for (; i < ARRAY_SIZE(w); i++)
+			w[i] = sigma_1(w[i - 2]) + w[i - 7] +
+			       sigma_0(w[i - 15]) + w[i - 16];
+		for (i = 0; i < ARRAY_SIZE(w); i += 8) {
+			SHA256_ROUND(i + 0, a, b, c, d, e, f, g, h);
+			SHA256_ROUND(i + 1, h, a, b, c, d, e, f, g);
+			SHA256_ROUND(i + 2, g, h, a, b, c, d, e, f);
+			SHA256_ROUND(i + 3, f, g, h, a, b, c, d, e);
+			SHA256_ROUND(i + 4, e, f, g, h, a, b, c, d);
+			SHA256_ROUND(i + 5, d, e, f, g, h, a, b, c);
+			SHA256_ROUND(i + 6, c, d, e, f, g, h, a, b);
+			SHA256_ROUND(i + 7, b, c, d, e, f, g, h, a);
+		}
+		state[0] += a;
+		state[1] += b;
+		state[2] += c;
+		state[3] += d;
+		state[4] += e;
+		state[5] += f;
+		state[6] += g;
+		state[7] += h;
+		data += SHA256_BLOCK_LENGTH;
+	}
+}
+
+void libbpf_sha256(const void *data, size_t len, __u8 out[SHA256_DIGEST_LENGTH])
+{
+	__u32 state[8] = { 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
+			   0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 };
+	const __be64 bitcount = cpu_to_be64((__u64)len * 8);
+	__u8 final_data[2 * SHA256_BLOCK_LENGTH] = { 0 };
+	size_t final_len = len % SHA256_BLOCK_LENGTH;
+	int i;
+
+	sha256_blocks(state, data, len / SHA256_BLOCK_LENGTH);
+
+	memcpy(final_data, data + len - final_len, final_len);
+	final_data[final_len] = 0x80;
+	final_len = roundup(final_len + 9, SHA256_BLOCK_LENGTH);
+	memcpy(&final_data[final_len - 8], &bitcount, 8);
+
+	sha256_blocks(state, final_data, final_len / SHA256_BLOCK_LENGTH);
+
+	for (i = 0; i < ARRAY_SIZE(state); i++)
+		put_unaligned_be32(state[i], &out[4 * i]);
+}
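A note on the padding arithmetic in libbpf_sha256() above: the 0x80 terminator plus the 8-byte big-endian bit count always fit once final_len + 9 is rounded up to a 64-byte multiple, so at most two extra blocks are processed; for len = 0, roundup(9, 64) = 64 leaves the bit count in bytes 56-63 of a single padding block. A quick standalone self-check against the published SHA-256("abc") test vector from FIPS 180-4 (a sketch, not part of the patch, assuming it is built next to libbpf_utils.c so that libbpf_internal.h resolves):

	/* Standalone sanity check: SHA-256("abc") = ba7816bf...f20015ad */
	#include <assert.h>
	#include <string.h>
	#include "libbpf_internal.h"

	int main(void)
	{
		static const __u8 expected[SHA256_DIGEST_LENGTH] = {
			0xba, 0x78, 0x16, 0xbf, 0x8f, 0x01, 0xcf, 0xea,
			0x41, 0x41, 0x40, 0xde, 0x5d, 0xae, 0x22, 0x23,
			0xb0, 0x03, 0x61, 0xa3, 0x96, 0x17, 0x7a, 0x9c,
			0xb4, 0x10, 0xff, 0x61, 0xf2, 0x00, 0x15, 0xad,
		};
		__u8 digest[SHA256_DIGEST_LENGTH];

		libbpf_sha256("abc", 3, digest);
		assert(memcmp(digest, expected, sizeof(digest)) == 0);
		return 0;
	}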
diff --git a/tools/lib/bpf/linker.c b/tools/lib/bpf/linker.c
index a469e5d4fee7..56ae77047bc3 100644
--- a/tools/lib/bpf/linker.c
+++ b/tools/lib/bpf/linker.c
@@ -25,7 +25,6 @@
 #include "btf.h"
 #include "libbpf_internal.h"
 #include "strset.h"
-#include "str_error.h"
 
 #define BTF_EXTERN_SEC ".extern"
diff --git a/tools/lib/bpf/relo_core.c b/tools/lib/bpf/relo_core.c
index 2b83c98a1137..6eea5edba58a 100644
--- a/tools/lib/bpf/relo_core.c
+++ b/tools/lib/bpf/relo_core.c
@@ -64,7 +64,6 @@ enum libbpf_print_level {
 #include "libbpf.h"
 #include "bpf.h"
 #include "btf.h"
-#include "str_error.h"
 #include "libbpf_internal.h"
 #endif
diff --git a/tools/lib/bpf/ringbuf.c b/tools/lib/bpf/ringbuf.c
index 9702b70da444..00ec4837a06d 100644
--- a/tools/lib/bpf/ringbuf.c
+++ b/tools/lib/bpf/ringbuf.c
@@ -21,7 +21,6 @@
 #include "libbpf.h"
 #include "libbpf_internal.h"
 #include "bpf.h"
-#include "str_error.h"
 
 struct ring {
 	ring_buffer_sample_fn sample_cb;
diff --git a/tools/lib/bpf/str_error.c b/tools/lib/bpf/str_error.c
deleted file mode 100644
index 9a541762f54c..000000000000
--- a/tools/lib/bpf/str_error.c
+++ /dev/null
@@ -1,104 +0,0 @@
-// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
-#undef _GNU_SOURCE
-#include <string.h>
-#include <stdio.h>
-#include <errno.h>
-#include "str_error.h"
-
-#ifndef ENOTSUPP
-#define ENOTSUPP 524
-#endif
-
-/* make sure libbpf doesn't use kernel-only integer typedefs */
-#pragma GCC poison u8 u16 u32 u64 s8 s16 s32 s64
-
-/*
- * Wrapper to allow for building in non-GNU systems such as Alpine Linux's musl
- * libc, while checking strerror_r() return to avoid having to check this in
- * all places calling it.
- */
-char *libbpf_strerror_r(int err, char *dst, int len)
-{
-	int ret = strerror_r(err < 0 ? -err : err, dst, len);
-	/* on glibc <2.13, ret == -1 and errno is set, if strerror_r() can't
-	 * handle the error, on glibc >=2.13 *positive* (errno-like) error
-	 * code is returned directly
-	 */
-	if (ret == -1)
-		ret = errno;
-	if (ret) {
-		if (ret == EINVAL)
-			/* strerror_r() doesn't recognize this specific error */
-			snprintf(dst, len, "unknown error (%d)", err < 0 ? err : -err);
-		else
-			snprintf(dst, len, "ERROR: strerror_r(%d)=%d", err, ret);
-	}
-	return dst;
-}
-
-const char *libbpf_errstr(int err)
-{
-	static __thread char buf[12];
-
-	if (err > 0)
-		err = -err;
-
-	switch (err) {
-	case -E2BIG: return "-E2BIG";
-	case -EACCES: return "-EACCES";
-	case -EADDRINUSE: return "-EADDRINUSE";
-	case -EADDRNOTAVAIL: return "-EADDRNOTAVAIL";
-	case -EAGAIN: return "-EAGAIN";
-	case -EALREADY: return "-EALREADY";
-	case -EBADF: return "-EBADF";
-	case -EBADFD: return "-EBADFD";
-	case -EBUSY: return "-EBUSY";
-	case -ECANCELED: return "-ECANCELED";
-	case -ECHILD: return "-ECHILD";
-	case -EDEADLK: return "-EDEADLK";
-	case -EDOM: return "-EDOM";
-	case -EEXIST: return "-EEXIST";
-	case -EFAULT: return "-EFAULT";
-	case -EFBIG: return "-EFBIG";
-	case -EILSEQ: return "-EILSEQ";
-	case -EINPROGRESS: return "-EINPROGRESS";
-	case -EINTR: return "-EINTR";
-	case -EINVAL: return "-EINVAL";
-	case -EIO: return "-EIO";
-	case -EISDIR: return "-EISDIR";
-	case -ELOOP: return "-ELOOP";
-	case -EMFILE: return "-EMFILE";
-	case -EMLINK: return "-EMLINK";
-	case -EMSGSIZE: return "-EMSGSIZE";
-	case -ENAMETOOLONG: return "-ENAMETOOLONG";
-	case -ENFILE: return "-ENFILE";
-	case -ENODATA: return "-ENODATA";
-	case -ENODEV: return "-ENODEV";
-	case -ENOENT: return "-ENOENT";
-	case -ENOEXEC: return "-ENOEXEC";
-	case -ENOLINK: return "-ENOLINK";
-	case -ENOMEM: return "-ENOMEM";
-	case -ENOSPC: return "-ENOSPC";
-	case -ENOTBLK: return "-ENOTBLK";
-	case -ENOTDIR: return "-ENOTDIR";
-	case -ENOTSUPP: return "-ENOTSUPP";
-	case -ENOTTY: return "-ENOTTY";
-	case -ENXIO: return "-ENXIO";
-	case -EOPNOTSUPP: return "-EOPNOTSUPP";
-	case -EOVERFLOW: return "-EOVERFLOW";
-	case -EPERM: return "-EPERM";
-	case -EPIPE: return "-EPIPE";
-	case -EPROTO: return "-EPROTO";
-	case -EPROTONOSUPPORT: return "-EPROTONOSUPPORT";
-	case -ERANGE: return "-ERANGE";
-	case -EROFS: return "-EROFS";
-	case -ESPIPE: return "-ESPIPE";
-	case -ESRCH: return "-ESRCH";
-	case -ETXTBSY: return "-ETXTBSY";
-	case -EUCLEAN: return "-EUCLEAN";
-	case -EXDEV: return "-EXDEV";
-	default:
-		snprintf(buf, sizeof(buf), "%d", err);
-		return buf;
-	}
-}
diff --git a/tools/lib/bpf/str_error.h b/tools/lib/bpf/str_error.h
deleted file mode 100644
index 53e7fbffc13e..000000000000
--- a/tools/lib/bpf/str_error.h
+++ /dev/null
@@ -1,19 +0,0 @@
-/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
-#ifndef __LIBBPF_STR_ERROR_H
-#define __LIBBPF_STR_ERROR_H
-
-#define STRERR_BUFSIZE 128
-
-char *libbpf_strerror_r(int err, char *dst, int len);
-
-/**
- * @brief **libbpf_errstr()** returns string corresponding to numeric errno
- * @param err negative numeric errno
- * @return pointer to string representation of the errno, that is invalidated
- * upon the next call.
- */
-const char *libbpf_errstr(int err);
-
-#define errstr(err) libbpf_errstr(err)
-
-#endif /* __LIBBPF_STR_ERROR_H */
diff --git a/tools/lib/bpf/usdt.c b/tools/lib/bpf/usdt.c
index fc2785eecc17..c174b4086673 100644
--- a/tools/lib/bpf/usdt.c
+++ b/tools/lib/bpf/usdt.c
@@ -20,7 +20,6 @@
 #include "libbpf_common.h"
 #include "libbpf_internal.h"
 #include "hashmap.h"
-#include "str_error.h"
 
 /* libbpf's USDT support consists of BPF-side state/code and user-space
  * state/code working together in concert. BPF-side parts are defined in
diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h
index d89eda3fd8a3..2cd9165c7348 100644
--- a/tools/testing/selftests/bpf/bpf_experimental.h
+++ b/tools/testing/selftests/bpf/bpf_experimental.h
@@ -219,7 +219,7 @@ extern void bpf_put_file(struct file *file) __ksym;
  * including the NULL termination character, stored in the supplied
  * buffer. On error, a negative integer is returned.
  */
-extern int bpf_path_d_path(struct path *path, char *buf, size_t buf__sz) __ksym;
+extern int bpf_path_d_path(const struct path *path, char *buf, size_t buf__sz) __ksym;
 
 /* This macro must be used to mark the exception callback corresponding to the
  * main program. For example:
diff --git a/tools/testing/selftests/bpf/prog_tests/sha256.c b/tools/testing/selftests/bpf/prog_tests/sha256.c
new file mode 100644
index 000000000000..604a0b1423d5
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/sha256.c
@@ -0,0 +1,52 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright 2025 Google LLC */
+
+#include <test_progs.h>
+#include "bpf/libbpf_internal.h"
+
+#define MAX_LEN 4096
+
+/* Test libbpf_sha256() for all lengths from 0 to MAX_LEN inclusively. */
+void test_sha256(void)
+{
+	/*
+	 * The correctness of this value was verified by running this test with
+	 * libbpf_sha256() replaced by OpenSSL's SHA256().
+	 */
+	static const __u8 expected_digest_of_digests[SHA256_DIGEST_LENGTH] = {
+		0x62, 0x30, 0x0e, 0x1d, 0xea, 0x7f, 0xc4, 0x74,
+		0xfd, 0x8e, 0x64, 0x0b, 0xd8, 0x5f, 0xea, 0x04,
+		0xf3, 0xef, 0x77, 0x42, 0xc2, 0x01, 0xb8, 0x90,
+		0x6e, 0x19, 0x91, 0x1b, 0xca, 0xb3, 0x28, 0x42,
+	};
+	__u64 seed = 0;
+	__u8 *data = NULL, *digests = NULL;
+	__u8 digest_of_digests[SHA256_DIGEST_LENGTH];
+	size_t i;
+
+	data = malloc(MAX_LEN);
+	if (!ASSERT_OK_PTR(data, "malloc"))
+		goto out;
+	digests = malloc((MAX_LEN + 1) * SHA256_DIGEST_LENGTH);
+	if (!ASSERT_OK_PTR(digests, "malloc"))
+		goto out;
+
+	/* Generate MAX_LEN bytes of "random" data deterministically. */
+	for (i = 0; i < MAX_LEN; i++) {
+		seed = (seed * 25214903917 + 11) & ((1ULL << 48) - 1);
+		data[i] = (__u8)(seed >> 16);
+	}
+
+	/* Calculate a digest for each length 0 through MAX_LEN inclusively. */
+	for (i = 0; i <= MAX_LEN; i++)
+		libbpf_sha256(data, i, &digests[i * SHA256_DIGEST_LENGTH]);
+
+	/* Calculate and verify the digest of all the digests. */
+	libbpf_sha256(digests, (MAX_LEN + 1) * SHA256_DIGEST_LENGTH,
+		      digest_of_digests);
+	ASSERT_MEMEQ(digest_of_digests, expected_digest_of_digests,
+		     SHA256_DIGEST_LENGTH, "digest_of_digests");
+out:
+	free(data);
+	free(digests);
+}
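The comment in the new test says expected_digest_of_digests was verified against OpenSSL. A sketch of how such a cross-check might be reproduced out of tree (assumes libcrypto is available; OpenSSL's one-shot SHA256() is deprecated since 3.0 but still works, and the 48-bit LCG constants match the test above):

	/* Hypothetical one-off verifier, built with: cc check.c -lcrypto */
	#include <openssl/sha.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <stdlib.h>

	#define MAX_LEN 4096

	int main(void)
	{
		uint64_t seed = 0;
		uint8_t *data = malloc(MAX_LEN);
		uint8_t *digests = malloc((MAX_LEN + 1) * SHA256_DIGEST_LENGTH);
		uint8_t final[SHA256_DIGEST_LENGTH];
		size_t i;

		if (!data || !digests)
			return 1;
		/* same deterministic "random" byte stream as the selftest */
		for (i = 0; i < MAX_LEN; i++) {
			seed = (seed * 25214903917 + 11) & ((1ULL << 48) - 1);
			data[i] = (uint8_t)(seed >> 16);
		}
		for (i = 0; i <= MAX_LEN; i++)
			SHA256(data, i, &digests[i * SHA256_DIGEST_LENGTH]);
		SHA256(digests, (MAX_LEN + 1) * SHA256_DIGEST_LENGTH, final);
		for (i = 0; i < SHA256_DIGEST_LENGTH; i++)
			printf("%02x", final[i]);	/* compare by eye with the array */
		printf("\n");
		free(data);
		free(digests);
		return 0;
	}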
diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
index 6d75ede16e7c..955a37751b52 100644
--- a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
+++ b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
@@ -661,7 +661,7 @@ static void *worker_trigger(void *arg)
 		rounds++;
 	}
 
-	printf("tid %d trigger rounds: %lu\n", gettid(), rounds);
+	printf("tid %ld trigger rounds: %lu\n", sys_gettid(), rounds);
 	return NULL;
 }
@@ -704,7 +704,7 @@ static void *worker_attach(void *arg)
 		rounds++;
 	}
 
-	printf("tid %d attach rounds: %lu hits: %d\n", gettid(), rounds, skel->bss->executed);
+	printf("tid %ld attach rounds: %lu hits: %d\n", sys_gettid(), rounds, skel->bss->executed);
 	uprobe_syscall_executed__destroy(skel);
 	free(ref);
 	return NULL;
diff --git a/tools/testing/selftests/bpf/prog_tests/usdt.c b/tools/testing/selftests/bpf/prog_tests/usdt.c
index 4f7f45e69315..f4be5269fa90 100644
--- a/tools/testing/selftests/bpf/prog_tests/usdt.c
+++ b/tools/testing/selftests/bpf/prog_tests/usdt.c
@@ -142,7 +142,7 @@ static void subtest_basic_usdt(bool optimized)
 		goto cleanup;
 #endif
 
-	alled = TRIGGER(1);
+	called = TRIGGER(1);
 
 	ASSERT_EQ(bss->usdt0_called, called, "usdt0_called");
 	ASSERT_EQ(bss->usdt3_called, called, "usdt3_called");
diff --git a/tools/testing/selftests/bpf/progs/verifier_value_illegal_alu.c b/tools/testing/selftests/bpf/progs/verifier_value_illegal_alu.c
index a9ab37d3b9e2..2129e4353fd9 100644
--- a/tools/testing/selftests/bpf/progs/verifier_value_illegal_alu.c
+++ b/tools/testing/selftests/bpf/progs/verifier_value_illegal_alu.c
@@ -3,6 +3,7 @@
 
 #include <linux/bpf.h>
 #include <bpf/bpf_helpers.h>
+#include "../../../include/linux/filter.h"
 #include "bpf_misc.h"
 
 #define MAX_ENTRIES 11
@@ -146,6 +147,24 @@ l0_%=: exit;					\
 	: __clobber_all);
 }
 
+SEC("socket")
+__description("map_ptr illegal alu op, map_ptr = -map_ptr")
+__failure __msg("R0 invalid mem access 'scalar'")
+__failure_unpriv __msg_unpriv("R0 pointer arithmetic prohibited")
+__flag(BPF_F_ANY_ALIGNMENT)
+__naked void map_ptr_illegal_alu_op(void)
+{
+	asm volatile ("					\
+	r0 = %[map_hash_48b] ll;			\
+	r0 = -r0;					\
+	r1 = 22;					\
+	*(u64*)(r0 + 0) = r1;				\
+	exit;						\
+"	:
+	: __imm_addr(map_hash_48b)
+	: __clobber_all);
+}
+
 SEC("flow_dissector")
 __description("flow_keys illegal alu op with variable offset")
 __failure __msg("R7 pointer arithmetic on flow_keys prohibited")
@@ -165,4 +184,32 @@ __naked void flow_keys_illegal_variable_offset_alu(void)
 	: __clobber_all);
 }
 
+#define DEFINE_BAD_OFFSET_TEST(name, op, off, imm)			\
+	SEC("socket")							\
+	__failure __msg("BPF_ALU uses reserved fields")			\
+	__naked void name(void)						\
+	{								\
+		asm volatile(						\
+		"r0 = 1;"						\
+		".8byte %[insn];"					\
+		"r0 = 0;"						\
+		"exit;"							\
+		:							\
+		: __imm_insn(insn, BPF_RAW_INSN((op), 0, 0, (off), (imm)))	\
+		: __clobber_all);					\
+	}
+
+/*
+ * Offset fields of 0 and 1 are legal for BPF_{DIV,MOD} instructions.
+ * Offset fields of 0 are legal for the rest of ALU instructions.
+ * Test that error is reported for illegal offsets, assuming that tests
+ * for legal offsets exist.
+ */
+DEFINE_BAD_OFFSET_TEST(bad_offset_divx, BPF_ALU64 | BPF_DIV | BPF_X, -1, 0)
+DEFINE_BAD_OFFSET_TEST(bad_offset_modk, BPF_ALU64 | BPF_MOD | BPF_K, -1, 1)
+DEFINE_BAD_OFFSET_TEST(bad_offset_addx, BPF_ALU64 | BPF_ADD | BPF_X, -1, 0)
+DEFINE_BAD_OFFSET_TEST(bad_offset_divx2, BPF_ALU64 | BPF_DIV | BPF_X, 2, 0)
+DEFINE_BAD_OFFSET_TEST(bad_offset_modk2, BPF_ALU64 | BPF_MOD | BPF_K, 2, 1)
+DEFINE_BAD_OFFSET_TEST(bad_offset_addx2, BPF_ALU64 | BPF_ADD | BPF_X, 1, 0)
+
 char _license[] SEC("license") = "GPL";
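Context for the offsets chosen above: in the BPF instruction set, offset 1 on BPF_DIV/BPF_MOD selects the signed division/modulo variants introduced with the cpuv4 instruction set, and offset 0 is required for every other ALU opcode, so -1, 2, and (for ADD) 1 are reserved encodings the verifier must reject. For contrast, a legal signed-division encoding would look like this (an illustrative macro, not part of the patch):

	/* Signed 64-bit division, dst /= src: offset 1 is the valid encoding. */
	#define BPF_SDIV64_REG(DST, SRC) \
		BPF_RAW_INSN(BPF_ALU64 | BPF_DIV | BPF_X, (DST), (SRC), 1, 0)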
+ */ +DEFINE_BAD_OFFSET_TEST(bad_offset_divx, BPF_ALU64 | BPF_DIV | BPF_X, -1, 0) +DEFINE_BAD_OFFSET_TEST(bad_offset_modk, BPF_ALU64 | BPF_MOD | BPF_K, -1, 1) +DEFINE_BAD_OFFSET_TEST(bad_offset_addx, BPF_ALU64 | BPF_ADD | BPF_X, -1, 0) +DEFINE_BAD_OFFSET_TEST(bad_offset_divx2, BPF_ALU64 | BPF_DIV | BPF_X, 2, 0) +DEFINE_BAD_OFFSET_TEST(bad_offset_modk2, BPF_ALU64 | BPF_MOD | BPF_K, 2, 1) +DEFINE_BAD_OFFSET_TEST(bad_offset_addx2, BPF_ALU64 | BPF_ADD | BPF_X, 1, 0) + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/trace_helpers.c b/tools/testing/selftests/bpf/trace_helpers.c index 171987627f3a..eeaab7013ca2 100644 --- a/tools/testing/selftests/bpf/trace_helpers.c +++ b/tools/testing/selftests/bpf/trace_helpers.c @@ -732,7 +732,7 @@ int bpf_get_addrs(unsigned long **addrsp, size_t *cntp, bool kernel) if (cnt == max_cnt) { max_cnt += inc_cnt; - tmp_addrs = realloc(addrs, max_cnt); + tmp_addrs = realloc(addrs, max_cnt * sizeof(long)); if (!tmp_addrs) { err = -ENOMEM; goto error; diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h index 3c3e08b8c90e..772ca1db6e59 100644 --- a/tools/testing/selftests/iommu/iommufd_utils.h +++ b/tools/testing/selftests/iommu/iommufd_utils.h @@ -1042,15 +1042,13 @@ static int _test_cmd_trigger_vevents(int fd, __u32 dev_id, __u32 nvevents) .dev_id = dev_id, }, }; - int ret; while (nvevents--) { - ret = ioctl(fd, _IOMMU_TEST_CMD(IOMMU_TEST_OP_TRIGGER_VEVENT), - &trigger_vevent_cmd); - if (ret < 0) + if (!ioctl(fd, _IOMMU_TEST_CMD(IOMMU_TEST_OP_TRIGGER_VEVENT), + &trigger_vevent_cmd)) return -1; } - return ret; + return 0; } #define test_cmd_trigger_vevents(dev_id, nvevents) \ diff --git a/tools/virtio/linux/kmsan.h b/tools/virtio/linux/kmsan.h index 272b5aa285d5..6cd2e3efd03d 100644 --- a/tools/virtio/linux/kmsan.h +++ b/tools/virtio/linux/kmsan.h @@ -4,7 +4,7 @@ #include <linux/gfp.h> -inline void kmsan_handle_dma(struct page *page, size_t offset, size_t size, +inline void kmsan_handle_dma(phys_addr_t phys, size_t size, enum dma_data_direction dir) { } |