summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.mailmap1
-rw-r--r--Documentation/ABI/stable/sysfs-block15
-rw-r--r--Documentation/ABI/testing/sysfs-driver-hid-appletb-kbd4
-rw-r--r--Documentation/admin-guide/blockdev/index.rst1
-rw-r--r--Documentation/admin-guide/blockdev/zoned_loop.rst169
-rw-r--r--Documentation/admin-guide/cgroup-v2.rst2
-rw-r--r--Documentation/admin-guide/kernel-parameters.txt2
-rw-r--r--Documentation/admin-guide/sysctl/vm.rst32
-rw-r--r--Documentation/devicetree/bindings/net/can/microchip,mcp2510.yaml2
-rw-r--r--Documentation/driver-api/early-userspace/buffer-format.rst34
-rw-r--r--Documentation/filesystems/fscrypt.rst2
-rw-r--r--Documentation/filesystems/iomap/design.rst16
-rw-r--r--Documentation/filesystems/locking.rst54
-rw-r--r--Documentation/filesystems/mount_api.rst16
-rw-r--r--Documentation/filesystems/netfs_library.rst1016
-rw-r--r--Documentation/filesystems/porting.rst40
-rw-r--r--Documentation/filesystems/vfs.rst39
-rw-r--r--Documentation/netlink/specs/tc.yaml10
-rw-r--r--Documentation/networking/timestamping.rst8
-rw-r--r--MAINTAINERS101
-rw-r--r--Makefile2
-rw-r--r--arch/arm64/boot/dts/allwinner/sun50i-h6-beelink-gs1.dts38
-rw-r--r--arch/arm64/boot/dts/allwinner/sun50i-h6-orangepi-3.dts14
-rw-r--r--arch/arm64/boot/dts/allwinner/sun50i-h6-orangepi.dtsi22
-rw-r--r--arch/arm64/boot/dts/marvell/armada-3720-uDPU.dtsi8
-rw-r--r--arch/arm64/configs/defconfig2
-rw-r--r--arch/loongarch/include/asm/ptrace.h2
-rw-r--r--arch/loongarch/include/asm/uprobes.h1
-rw-r--r--arch/loongarch/kernel/genex.S7
-rw-r--r--arch/loongarch/kernel/kfpu.c22
-rw-r--r--arch/loongarch/kernel/time.c2
-rw-r--r--arch/loongarch/kernel/uprobes.c11
-rw-r--r--arch/loongarch/power/hibernate.c3
-rw-r--r--arch/m68k/configs/amcore_defconfig1
-rw-r--r--arch/mips/configs/gcw0_defconfig1
-rw-r--r--arch/mips/include/asm/socket.h9
-rw-r--r--arch/s390/hypfs/inode.c2
-rw-r--r--arch/x86/coco/sev/core.c255
-rw-r--r--arch/x86/configs/i386_defconfig1
-rw-r--r--arch/x86/events/intel/ds.c9
-rw-r--r--arch/x86/include/asm/cpufeatures.h2
-rw-r--r--arch/x86/include/asm/sev-common.h2
-rw-r--r--arch/x86/kernel/cpu/amd.c5
-rw-r--r--arch/x86/mm/init_32.c2
-rw-r--r--arch/xtensa/configs/cadence_csp_defconfig1
-rw-r--r--block/Kconfig8
-rw-r--r--block/Makefile5
-rw-r--r--block/bfq-iosched.c6
-rw-r--r--block/bio-integrity-auto.c62
-rw-r--r--block/bio-integrity.c4
-rw-r--r--block/bio.c160
-rw-r--r--block/blk-core.c2
-rw-r--r--block/blk-crypto-fallback.c1
-rw-r--r--block/blk-map.c93
-rw-r--r--block/blk-merge.c137
-rw-r--r--block/blk-mq-debugfs.c13
-rw-r--r--block/blk-mq-dma.c116
-rw-r--r--block/blk-mq-sched.c53
-rw-r--r--block/blk-mq.c309
-rw-r--r--block/blk-mq.h7
-rw-r--r--block/blk-rq-qos.c4
-rw-r--r--block/blk-rq-qos.h21
-rw-r--r--block/blk-settings.c5
-rw-r--r--block/blk-sysfs.c34
-rw-r--r--block/blk-throttle.c411
-rw-r--r--block/blk-throttle.h36
-rw-r--r--block/blk-wbt.c11
-rw-r--r--block/blk.h50
-rw-r--r--block/bounce.c267
-rw-r--r--block/elevator.c329
-rw-r--r--block/elevator.h6
-rw-r--r--block/fops.c28
-rw-r--r--block/genhd.c266
-rw-r--r--block/mq-deadline.c2
-rw-r--r--crypto/algif_hash.c4
-rw-r--r--drivers/accel/ivpu/ivpu_debugfs.c2
-rw-r--r--drivers/acpi/pptt.c11
-rw-r--r--drivers/android/binderfs.c4
-rw-r--r--drivers/base/node.c2
-rw-r--r--drivers/block/Kconfig19
-rw-r--r--drivers/block/Makefile1
-rw-r--r--drivers/block/brd.c225
-rw-r--r--drivers/block/loop.c3
-rw-r--r--drivers/block/pktcdvd.c2
-rw-r--r--drivers/block/rnbd/rnbd-srv.c7
-rw-r--r--drivers/block/ublk_drv.c571
-rw-r--r--drivers/block/virtio_blk.c4
-rw-r--r--drivers/block/zloop.c1385
-rw-r--r--drivers/bluetooth/btusb.c98
-rw-r--r--drivers/cdrom/cdrom.c3
-rw-r--r--drivers/clk/clk-s2mps11.c3
-rw-r--r--drivers/clk/rockchip/clk-rk3576.c2
-rw-r--r--drivers/clk/sunxi-ng/ccu-sun20i-d1.c44
-rw-r--r--drivers/clk/sunxi-ng/ccu_mp.h25
-rw-r--r--drivers/dma-buf/dma-resv.c5
-rw-r--r--drivers/dma/amd/ptdma/ptdma-dmaengine.c19
-rw-r--r--drivers/dma/dmatest.c6
-rw-r--r--drivers/dma/fsl-edma-main.c2
-rw-r--r--drivers/dma/idxd/cdev.c13
-rw-r--r--drivers/dma/idxd/init.c159
-rw-r--r--drivers/dma/mediatek/mtk-cqdma.c6
-rw-r--r--drivers/dma/ti/k3-udma.c10
-rw-r--r--drivers/firmware/samsung/exynos-acpm.c44
-rw-r--r--drivers/gpio/gpio-pca953x.c6
-rw-r--r--drivers/gpio/gpio-virtuser.c12
-rw-r--r--drivers/gpio/gpiolib.c6
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c2
-rw-r--r--drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c12
-rw-r--r--drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c8
-rw-r--r--drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c10
-rw-r--r--drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c16
-rw-r--r--drivers/gpu/drm/amd/display/dc/core/dc.c10
-rw-r--r--drivers/gpu/drm/amd/display/dc/dml/dcn35/dcn35_fpu.c4
-rw-r--r--drivers/gpu/drm/amd/display/dc/dml2/dml21/dml21_translation_helper.c20
-rw-r--r--drivers/gpu/drm/amd/display/dc/dpp/dcn401/dcn401_dpp_cm.c5
-rw-r--r--drivers/gpu/drm/amd/display/dc/hwss/dcn401/dcn401_hwseq.c6
-rw-r--r--drivers/gpu/drm/amd/display/dc/link/link_dpms.c13
-rw-r--r--drivers/gpu/drm/drm_edid.c1
-rw-r--r--drivers/gpu/drm/drm_gpusvm.c37
-rw-r--r--drivers/gpu/drm/i915/gem/i915_gem_shmem.c32
-rw-r--r--drivers/gpu/drm/meson/meson_encoder_hdmi.c4
-rw-r--r--drivers/gpu/drm/tiny/panel-mipi-dbi.c5
-rw-r--r--drivers/gpu/drm/ttm/ttm_backup.c8
-rw-r--r--drivers/gpu/drm/xe/instructions/xe_mi_commands.h4
-rw-r--r--drivers/gpu/drm/xe/regs/xe_engine_regs.h5
-rw-r--r--drivers/gpu/drm/xe/regs/xe_gt_regs.h1
-rw-r--r--drivers/gpu/drm/xe/regs/xe_lrc_layout.h2
-rw-r--r--drivers/gpu/drm/xe/xe_device_types.h2
-rw-r--r--drivers/gpu/drm/xe/xe_exec_queue.c2
-rw-r--r--drivers/gpu/drm/xe/xe_guc_submit.c2
-rw-r--r--drivers/gpu/drm/xe/xe_lrc.c199
-rw-r--r--drivers/gpu/drm/xe/xe_lrc.h5
-rw-r--r--drivers/gpu/drm/xe/xe_lrc_types.h9
-rw-r--r--drivers/gpu/drm/xe/xe_mmio.c10
-rw-r--r--drivers/gpu/drm/xe/xe_mocs.c11
-rw-r--r--drivers/gpu/drm/xe/xe_module.c3
-rw-r--r--drivers/gpu/drm/xe/xe_module.h1
-rw-r--r--drivers/gpu/drm/xe/xe_pci.c2
-rw-r--r--drivers/gpu/drm/xe/xe_pci_types.h1
-rw-r--r--drivers/gpu/drm/xe/xe_pt.c14
-rw-r--r--drivers/gpu/drm/xe/xe_ring_ops.c7
-rw-r--r--drivers/gpu/drm/xe/xe_shrinker.c2
-rw-r--r--drivers/gpu/drm/xe/xe_svm.c116
-rw-r--r--drivers/gpu/drm/xe/xe_svm.h5
-rw-r--r--drivers/gpu/drm/xe/xe_trace_lrc.h8
-rw-r--r--drivers/gpu/drm/xe/xe_wa.c4
-rw-r--r--drivers/hid/amd-sfh-hid/sfh1_1/amd_sfh_init.c12
-rw-r--r--drivers/hid/bpf/hid_bpf_dispatch.c9
-rw-r--r--drivers/hid/bpf/progs/XPPen__ACK05.bpf.c1
-rw-r--r--drivers/hid/hid-ids.h4
-rw-r--r--drivers/hid/hid-quirks.c2
-rw-r--r--drivers/hid/hid-steam.c2
-rw-r--r--drivers/hid/hid-thrustmaster.c1
-rw-r--r--drivers/hid/hid-uclogic-core.c7
-rw-r--r--drivers/hid/wacom_sys.c11
-rw-r--r--drivers/hv/channel.c65
-rw-r--r--drivers/i2c/busses/i2c-designware-pcidrv.c4
-rw-r--r--drivers/infiniband/core/device.c6
-rw-r--r--drivers/infiniband/hw/irdma/main.c4
-rw-r--r--drivers/infiniband/hw/irdma/verbs.c1
-rw-r--r--drivers/infiniband/hw/qib/qib_fs.c4
-rw-r--r--drivers/infiniband/sw/rxe/rxe_cq.c5
-rw-r--r--drivers/input/joystick/xpad.c3
-rw-r--r--drivers/input/rmi4/rmi_f34.c135
-rw-r--r--drivers/iommu/iommu.c43
-rw-r--r--drivers/irqchip/irq-gic-v2m.c2
-rw-r--r--drivers/irqchip/irq-gic-v3-its-msi-parent.c2
-rw-r--r--drivers/irqchip/irq-gic-v3-mbi.c2
-rw-r--r--drivers/irqchip/irq-mvebu-gicp.c2
-rw-r--r--drivers/irqchip/irq-mvebu-odmi.c2
-rw-r--r--drivers/irqchip/irq-riscv-imsic-state.c10
-rw-r--r--drivers/md/bcache/super.c3
-rw-r--r--drivers/md/dm-bufio.c2
-rw-r--r--drivers/md/dm-integrity.c16
-rw-r--r--drivers/md/dm-raid.c3
-rw-r--r--drivers/md/md.c190
-rw-r--r--drivers/md/md.h18
-rw-r--r--drivers/md/raid1.c3
-rw-r--r--drivers/md/raid10.c9
-rw-r--r--drivers/md/raid5.c8
-rw-r--r--drivers/mmc/host/sdhci-of-dwcmshc.c40
-rw-r--r--drivers/mmc/host/sdhci_am654.c35
-rw-r--r--drivers/net/can/kvaser_pciefd.c182
-rw-r--r--drivers/net/can/slcan/slcan-core.c26
-rw-r--r--drivers/net/dsa/b53/b53_common.c33
-rw-r--r--drivers/net/dsa/b53/b53_regs.h14
-rw-r--r--drivers/net/dsa/microchip/ksz_common.c135
-rw-r--r--drivers/net/dsa/sja1105/sja1105_main.c6
-rw-r--r--drivers/net/ethernet/airoha/airoha_eth.c22
-rw-r--r--drivers/net/ethernet/broadcom/bnxt/bnxt.c36
-rw-r--r--drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c9
-rw-r--r--drivers/net/ethernet/cadence/macb_main.c19
-rw-r--r--drivers/net/ethernet/engleder/tsnep_main.c30
-rw-r--r--drivers/net/ethernet/hisilicon/hibmcge/hbg_err.c16
-rw-r--r--drivers/net/ethernet/hisilicon/hibmcge/hbg_ethtool.c3
-rw-r--r--drivers/net/ethernet/intel/ice/ice_lag.c6
-rw-r--r--drivers/net/ethernet/intel/ice/ice_virtchnl.c1
-rw-r--r--drivers/net/ethernet/intel/idpf/idpf.h2
-rw-r--r--drivers/net/ethernet/intel/idpf/idpf_lib.c10
-rw-r--r--drivers/net/ethernet/intel/idpf/idpf_txrx.c18
-rw-r--r--drivers/net/ethernet/marvell/octeontx2/af/cgx.c5
-rw-r--r--drivers/net/ethernet/marvell/octeontx2/af/rvu_cn10k.c24
-rw-r--r--drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c11
-rw-r--r--drivers/net/ethernet/marvell/octeontx2/nic/cn10k_macsec.c3
-rw-r--r--drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h1
-rw-r--r--drivers/net/ethernet/marvell/octeontx2/nic/otx2_devlink.c1
-rw-r--r--drivers/net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c10
-rw-r--r--drivers/net/ethernet/marvell/octeontx2/nic/otx2_flows.c3
-rw-r--r--drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c9
-rw-r--r--drivers/net/ethernet/mediatek/mtk_eth_soc.c2
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en_main.c4
-rw-r--r--drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c3
-rw-r--r--drivers/net/ethernet/microchip/lan743x_main.c19
-rw-r--r--drivers/net/ethernet/qlogic/qede/qede_main.c2
-rw-r--r--drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_common.c7
-rw-r--r--drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c2
-rw-r--r--drivers/net/ethernet/ti/am65-cpsw-nuss.c2
-rw-r--r--drivers/net/ethernet/wangxun/libwx/wx_hw.c10
-rw-r--r--drivers/net/ethernet/wangxun/txgbe/txgbe_hw.c8
-rw-r--r--drivers/net/ethernet/wangxun/txgbe/txgbe_type.h2
-rw-r--r--drivers/net/hyperv/hyperv_net.h13
-rw-r--r--drivers/net/hyperv/netvsc.c57
-rw-r--r--drivers/net/hyperv/netvsc_drv.c62
-rw-r--r--drivers/net/hyperv/rndis_filter.c24
-rw-r--r--drivers/net/phy/micrel.c7
-rw-r--r--drivers/net/team/team_core.c6
-rw-r--r--drivers/net/vmxnet3/vmxnet3_drv.c5
-rw-r--r--drivers/net/wireless/mediatek/mt76/dma.c1
-rw-r--r--drivers/net/wireless/mediatek/mt76/mt7925/mcu.c4
-rw-r--r--drivers/nvme/common/auth.c15
-rw-r--r--drivers/nvme/host/auth.c30
-rw-r--r--drivers/nvme/host/core.c235
-rw-r--r--drivers/nvme/host/fc.c13
-rw-r--r--drivers/nvme/host/multipath.c209
-rw-r--r--drivers/nvme/host/nvme.h34
-rw-r--r--drivers/nvme/host/pci.c306
-rw-r--r--drivers/nvme/host/sysfs.c35
-rw-r--r--drivers/nvme/host/tcp.c14
-rw-r--r--drivers/nvme/target/admin-cmd.c31
-rw-r--r--drivers/nvme/target/auth.c21
-rw-r--r--drivers/nvme/target/core.c94
-rw-r--r--drivers/nvme/target/discovery.c2
-rw-r--r--drivers/nvme/target/fabrics-cmd.c12
-rw-r--r--drivers/nvme/target/fc.c96
-rw-r--r--drivers/nvme/target/fcloop.c439
-rw-r--r--drivers/nvme/target/loop.c29
-rw-r--r--drivers/nvme/target/nvmet.h24
-rw-r--r--drivers/nvme/target/pci-epf.c53
-rw-r--r--drivers/nvme/target/rdma.c8
-rw-r--r--drivers/nvme/target/tcp.c100
-rw-r--r--drivers/phy/phy-can-transceiver.c22
-rw-r--r--drivers/phy/qualcomm/phy-qcom-qmp-ufs.c3
-rw-r--r--drivers/phy/renesas/phy-rcar-gen3-usb2.c135
-rw-r--r--drivers/phy/rockchip/phy-rockchip-samsung-dcphy.c2
-rw-r--r--drivers/phy/rockchip/phy-rockchip-samsung-hdptx.c2
-rw-r--r--drivers/phy/starfive/phy-jh7110-usb.c7
-rw-r--r--drivers/phy/tegra/xusb-tegra186.c46
-rw-r--r--drivers/phy/tegra/xusb.c8
-rw-r--r--drivers/pinctrl/qcom/pinctrl-msm.c23
-rw-r--r--drivers/platform/x86/dell/dell-wmi-sysman/passobj-attributes.c2
-rw-r--r--drivers/platform/x86/fujitsu-laptop.c33
-rw-r--r--drivers/platform/x86/intel/pmc/arl.c3
-rw-r--r--drivers/platform/x86/think-lmi.c26
-rw-r--r--drivers/platform/x86/think-lmi.h1
-rw-r--r--drivers/platform/x86/thinkpad_acpi.c5
-rw-r--r--drivers/pmdomain/core.c2
-rw-r--r--drivers/pmdomain/renesas/rcar-gen4-sysc.c5
-rw-r--r--drivers/pmdomain/renesas/rcar-sysc.c5
-rw-r--r--drivers/ptp/ptp_ocp.c24
-rw-r--r--drivers/regulator/max20086-regulator.c7
-rw-r--r--drivers/remoteproc/qcom_wcnss.c3
-rw-r--r--drivers/scsi/Kconfig3
-rw-r--r--drivers/scsi/aha152x.c1
-rw-r--r--drivers/scsi/imm.c1
-rw-r--r--drivers/scsi/ppa.c1
-rw-r--r--drivers/scsi/scsi_ioctl.c2
-rw-r--r--drivers/scsi/scsi_lib.c6
-rw-r--r--drivers/scsi/sd_zbc.c6
-rw-r--r--drivers/scsi/storvsc_drv.c1
-rw-r--r--drivers/soc/samsung/exynos-usi.c2
-rw-r--r--drivers/soundwire/bus.c9
-rw-r--r--drivers/spi/spi-fsl-dspi.c46
-rw-r--r--drivers/spi/spi-loopback-test.c2
-rw-r--r--drivers/spi/spi-sun4i.c5
-rw-r--r--drivers/spi/spi-tegra114.c6
-rw-r--r--drivers/thermal/intel/x86_pkg_temp_thermal.c1
-rw-r--r--drivers/usb/gadget/function/f_midi2.c2
-rw-r--r--drivers/usb/storage/usb.c20
-rw-r--r--fs/9p/vfs_addr.c1
-rw-r--r--fs/afs/dir.c2
-rw-r--r--fs/afs/dir_silly.c6
-rw-r--r--fs/aio.c1
-rw-r--r--fs/anon_inodes.c45
-rw-r--r--fs/autofs/dev-ioctl.c3
-rw-r--r--fs/bcachefs/backpointers.c117
-rw-r--r--fs/bcachefs/btree_cache.c9
-rw-r--r--fs/bcachefs/btree_iter.c24
-rw-r--r--fs/bcachefs/btree_key_cache.c25
-rw-r--r--fs/bcachefs/btree_key_cache.h3
-rw-r--r--fs/bcachefs/dirent.c33
-rw-r--r--fs/bcachefs/dirent.h2
-rw-r--r--fs/bcachefs/disk_accounting.c17
-rw-r--r--fs/bcachefs/disk_accounting.h16
-rw-r--r--fs/bcachefs/ec.c20
-rw-r--r--fs/bcachefs/extents.h7
-rw-r--r--fs/bcachefs/fs-io-pagecache.c18
-rw-r--r--fs/bcachefs/fs.c30
-rw-r--r--fs/bcachefs/fsck.c39
-rw-r--r--fs/bcachefs/inode.c36
-rw-r--r--fs/bcachefs/inode.h4
-rw-r--r--fs/bcachefs/journal_reclaim.c17
-rw-r--r--fs/bcachefs/namei.c2
-rw-r--r--fs/bcachefs/rebalance.c2
-rw-r--r--fs/bcachefs/sb-errors_format.h8
-rw-r--r--fs/bcachefs/xattr.c6
-rw-r--r--fs/bfs/inode.c30
-rw-r--r--fs/binfmt_elf.c76
-rw-r--r--fs/binfmt_misc.c2
-rw-r--r--fs/btrfs/ioctl.c9
-rw-r--r--fs/btrfs/scrub.c10
-rw-r--r--fs/buffer.c24
-rw-r--r--fs/cachefiles/internal.h1
-rw-r--r--fs/cachefiles/key.c1
-rw-r--r--fs/cachefiles/namei.c14
-rw-r--r--fs/coredump.c461
-rw-r--r--fs/dcache.c12
-rw-r--r--fs/debugfs/inode.c6
-rw-r--r--fs/ecryptfs/inode.c16
-rw-r--r--fs/efivarfs/internal.h1
-rw-r--r--fs/efivarfs/super.c206
-rw-r--r--fs/exec.c60
-rw-r--r--fs/exportfs/expfs.c6
-rw-r--r--fs/f2fs/gc.c6
-rw-r--r--fs/file_table.c2
-rw-r--r--fs/filesystems.c14
-rw-r--r--fs/fs_context.c6
-rw-r--r--fs/fs_parser.c55
-rw-r--r--fs/fuse/dir.c2
-rw-r--r--fs/fuse/readdir.c4
-rw-r--r--fs/gfs2/ops_fstype.c24
-rw-r--r--fs/gfs2/super.c24
-rw-r--r--fs/gfs2/sys.c4
-rw-r--r--fs/hfsplus/wrapper.c46
-rw-r--r--fs/internal.h7
-rw-r--r--fs/ioctl.c15
-rw-r--r--fs/iomap/buffered-io.c100
-rw-r--r--fs/iomap/trace.h27
-rw-r--r--fs/kernfs/mount.c17
-rw-r--r--fs/libfs.c13
-rw-r--r--fs/mpage.c13
-rw-r--r--fs/namei.c235
-rw-r--r--fs/namespace.c43
-rw-r--r--fs/nfs/client.c9
-rw-r--r--fs/nfs/dir.c15
-rw-r--r--fs/nfs/direct.c2
-rw-r--r--fs/nfs/filelayout/filelayoutdev.c6
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.c6
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayoutdev.c6
-rw-r--r--fs/nfs/localio.c2
-rw-r--r--fs/nfs/netns.h6
-rw-r--r--fs/nfs/nfs3acl.c2
-rw-r--r--fs/nfs/nfs4proc.c18
-rw-r--r--fs/nfs/nfs4trace.h34
-rw-r--r--fs/nfs/pnfs.c51
-rw-r--r--fs/nfs/pnfs.h4
-rw-r--r--fs/nfs/pnfs_nfs.c32
-rw-r--r--fs/nfs/symlink.c20
-rw-r--r--fs/nfs/unlink.c11
-rw-r--r--fs/nfsd/nfs3proc.c4
-rw-r--r--fs/nfsd/nfs3xdr.c4
-rw-r--r--fs/nfsd/nfs4proc.c4
-rw-r--r--fs/nfsd/nfs4recover.c13
-rw-r--r--fs/nfsd/nfs4xdr.c4
-rw-r--r--fs/nfsd/nfsproc.c5
-rw-r--r--fs/nfsd/vfs.c17
-rw-r--r--fs/omfs/inode.c176
-rw-r--r--fs/open.c14
-rw-r--r--fs/orangefs/inode.c9
-rw-r--r--fs/overlayfs/export.c6
-rw-r--r--fs/overlayfs/namei.c14
-rw-r--r--fs/overlayfs/overlayfs.h2
-rw-r--r--fs/overlayfs/readdir.c21
-rw-r--r--fs/pidfs.c165
-rw-r--r--fs/proc/base.c2
-rw-r--r--fs/proc/meminfo.c3
-rw-r--r--fs/proc_namespace.c12
-rw-r--r--fs/quota/dquot.c2
-rw-r--r--fs/read_write.c4
-rw-r--r--fs/readdir.c47
-rw-r--r--fs/select.c4
-rw-r--r--fs/smb/client/cached_dir.c5
-rw-r--r--fs/smb/client/cifsfs.c3
-rw-r--r--fs/smb/client/file.c6
-rw-r--r--fs/smb/client/readdir.c10
-rw-r--r--fs/smb/client/smb2pdu.c2
-rw-r--r--fs/smb/server/oplock.c7
-rw-r--r--fs/smb/server/smb2pdu.c7
-rw-r--r--fs/smb/server/vfs.c16
-rw-r--r--fs/stat.c35
-rw-r--r--fs/super.c318
-rw-r--r--fs/tracefs/inode.c2
-rw-r--r--fs/vboxsf/file.c47
-rw-r--r--fs/xfs/scrub/fscounters.c4
-rw-r--r--fs/xfs/scrub/orphanage.c7
-rw-r--r--fs/xfs/xfs_bio_io.c30
-rw-r--r--fs/xfs/xfs_buf.c43
-rw-r--r--fs/xfs/xfs_log.c32
-rw-r--r--fs/xfs/xfs_notify_failure.c6
-rw-r--r--fs/xfs/xfs_super.c28
-rw-r--r--fs/xfs/xfs_trans_ail.c34
-rw-r--r--fs/xfs/xfs_zone_gc.c5
-rw-r--r--fs/zonefs/super.c34
-rw-r--r--include/drm/drm_gpusvm.h47
-rw-r--r--include/drm/intel/pciids.h4
-rw-r--r--include/linux/alloc_tag.h12
-rw-r--r--include/linux/binfmts.h1
-rw-r--r--include/linux/bio.h26
-rw-r--r--include/linux/blk-mq.h10
-rw-r--r--include/linux/blk_types.h10
-rw-r--r--include/linux/blkdev.h24
-rw-r--r--include/linux/cgroup.h26
-rw-r--r--include/linux/codetag.h8
-rw-r--r--include/linux/coredump.h1
-rw-r--r--include/linux/dcache.h4
-rw-r--r--include/linux/device_cgroup.h7
-rw-r--r--include/linux/dmapool.h21
-rw-r--r--include/linux/file.h2
-rw-r--r--include/linux/fs.h45
-rw-r--r--include/linux/fs_parser.h7
-rw-r--r--include/linux/highmem.h10
-rw-r--r--include/linux/hugetlb.h5
-rw-r--r--include/linux/hyperv.h7
-rw-r--r--include/linux/io_uring/cmd.h9
-rw-r--r--include/linux/io_uring_types.h15
-rw-r--r--include/linux/micrel_phy.h1
-rw-r--r--include/linux/mm.h2
-rw-r--r--include/linux/mman.h2
-rw-r--r--include/linux/mmzone.h1
-rw-r--r--include/linux/mount.h87
-rw-r--r--include/linux/mroute_base.h5
-rw-r--r--include/linux/namei.h17
-rw-r--r--include/linux/net.h4
-rw-r--r--include/linux/nfs_fs_sb.h12
-rw-r--r--include/linux/nvme.h77
-rw-r--r--include/linux/page-flags.h7
-rw-r--r--include/linux/part_stat.h2
-rw-r--r--include/linux/percpu-rwsem.h20
-rw-r--r--include/linux/percpu.h4
-rw-r--r--include/linux/pgalloc_tag.h8
-rw-r--r--include/linux/pid.h2
-rw-r--r--include/linux/pidfs.h8
-rw-r--r--include/linux/shmem_fs.h7
-rw-r--r--include/linux/soundwire/sdw_intel.h2
-rw-r--r--include/linux/spi/spi.h5
-rw-r--r--include/net/bluetooth/hci_core.h1
-rw-r--r--include/net/netdev_lock.h3
-rw-r--r--include/net/sch_generic.h15
-rw-r--r--include/net/xfrm.h1
-rw-r--r--include/scsi/scsi_host.h2
-rw-r--r--include/sound/pcm.h2
-rw-r--r--include/sound/ump_msg.h4
-rw-r--r--include/trace/events/block.h17
-rw-r--r--include/trace/events/io_uring.h2
-rw-r--r--include/uapi/linux/blktrace_api.h2
-rw-r--r--include/uapi/linux/io_uring.h12
-rw-r--r--include/uapi/linux/pidfd.h18
-rw-r--r--include/uapi/linux/taskstats.h47
-rw-r--r--include/uapi/linux/ublk_cmd.h128
-rw-r--r--init/Kconfig10
-rw-r--r--io_uring/Makefile6
-rw-r--r--io_uring/advise.c4
-rw-r--r--io_uring/cancel.c2
-rw-r--r--io_uring/cmd_net.c83
-rw-r--r--io_uring/epoll.c4
-rw-r--r--io_uring/eventfd.c66
-rw-r--r--io_uring/eventfd.h3
-rw-r--r--io_uring/fdinfo.c86
-rw-r--r--io_uring/fs.c10
-rw-r--r--io_uring/futex.c6
-rw-r--r--io_uring/io-wq.c65
-rw-r--r--io_uring/io-wq.h5
-rw-r--r--io_uring/io_uring.c288
-rw-r--r--io_uring/io_uring.h4
-rw-r--r--io_uring/kbuf.c148
-rw-r--r--io_uring/kbuf.h8
-rw-r--r--io_uring/memmap.c13
-rw-r--r--io_uring/memmap.h4
-rw-r--r--io_uring/msg_ring.c2
-rw-r--r--io_uring/net.c76
-rw-r--r--io_uring/nop.c2
-rw-r--r--io_uring/notif.c1
-rw-r--r--io_uring/opdef.c11
-rw-r--r--io_uring/openclose.c139
-rw-r--r--io_uring/openclose.h3
-rw-r--r--io_uring/poll.c4
-rw-r--r--io_uring/rsrc.c91
-rw-r--r--io_uring/rsrc.h28
-rw-r--r--io_uring/rw.c8
-rw-r--r--io_uring/rw.h2
-rw-r--r--io_uring/splice.c4
-rw-r--r--io_uring/statx.c2
-rw-r--r--io_uring/sync.c6
-rw-r--r--io_uring/tctx.c2
-rw-r--r--io_uring/timeout.c13
-rw-r--r--io_uring/timeout.h13
-rw-r--r--io_uring/truncate.c2
-rw-r--r--io_uring/uring_cmd.c96
-rw-r--r--io_uring/uring_cmd.h6
-rw-r--r--io_uring/waitid.c2
-rw-r--r--io_uring/xattr.c8
-rw-r--r--io_uring/zcrx.c372
-rw-r--r--io_uring/zcrx.h26
-rw-r--r--ipc/mqueue.c5
-rw-r--r--kernel/bpf/inode.c2
-rw-r--r--kernel/exit.c10
-rw-r--r--kernel/fork.c97
-rw-r--r--kernel/locking/percpu-rwsem.c13
-rw-r--r--kernel/module/main.c1
-rw-r--r--kernel/nsproxy.c30
-rw-r--r--kernel/padata.c3
-rw-r--r--kernel/pid.c6
-rw-r--r--kernel/power/hibernate.c16
-rw-r--r--kernel/power/main.c31
-rw-r--r--kernel/power/power.h4
-rw-r--r--kernel/power/suspend.c7
-rw-r--r--kernel/power/swap.c103
-rw-r--r--kernel/trace/blktrace.c11
-rw-r--r--lib/alloc_tag.c87
-rw-r--r--lib/codetag.c5
-rw-r--r--mm/cma.c5
-rw-r--r--mm/dmapool.c15
-rw-r--r--mm/hugetlb.c52
-rw-r--r--mm/internal.h1
-rw-r--r--mm/kasan/shadow.c92
-rw-r--r--mm/memcontrol.c6
-rw-r--r--mm/memory.c2
-rw-r--r--mm/migrate.c60
-rw-r--r--mm/mm_init.c1
-rw-r--r--mm/mremap.c3
-rw-r--r--mm/page-writeback.c28
-rw-r--r--mm/page_alloc.c96
-rw-r--r--mm/page_io.c3
-rw-r--r--mm/readahead.c20
-rw-r--r--mm/shmem.c33
-rw-r--r--mm/show_mem.c4
-rw-r--r--mm/swap.h4
-rw-r--r--mm/swap_state.c1
-rw-r--r--mm/swapfile.c2
-rw-r--r--mm/truncate.c20
-rw-r--r--mm/userfaultfd.c12
-rw-r--r--mm/vma.c1
-rw-r--r--mm/vmalloc.c13
-rw-r--r--mm/vmscan.c29
-rw-r--r--mm/zsmalloc.c8
-rw-r--r--net/batman-adv/hard-interface.c31
-rw-r--r--net/bluetooth/hci_conn.c24
-rw-r--r--net/bluetooth/hci_event.c73
-rw-r--r--net/bluetooth/l2cap_core.c15
-rw-r--r--net/bluetooth/mgmt.c9
-rw-r--r--net/bridge/br_nf_core.c7
-rw-r--r--net/bridge/br_private.h1
-rw-r--r--net/can/bcm.c79
-rw-r--r--net/core/dev.c2
-rw-r--r--net/core/devmem.c7
-rw-r--r--net/core/devmem.h2
-rw-r--r--net/core/netdev-genl.c11
-rw-r--r--net/core/sock.c12
-rw-r--r--net/dsa/tag_ksz.c19
-rw-r--r--net/ipv4/esp4.c53
-rw-r--r--net/ipv4/ipmr.c12
-rw-r--r--net/ipv4/xfrm4_input.c18
-rw-r--r--net/ipv6/esp6.c53
-rw-r--r--net/ipv6/ip6mr.c12
-rw-r--r--net/ipv6/xfrm6_input.c18
-rw-r--r--net/llc/af_llc.c8
-rw-r--r--net/mac80211/main.c6
-rw-r--r--net/mctp/device.c17
-rw-r--r--net/mctp/route.c4
-rw-r--r--net/sched/sch_codel.c2
-rw-r--r--net/sched/sch_fq.c2
-rw-r--r--net/sched/sch_fq_codel.c2
-rw-r--r--net/sched/sch_fq_pie.c2
-rw-r--r--net/sched/sch_hfsc.c6
-rw-r--r--net/sched/sch_hhf.c2
-rw-r--r--net/sched/sch_pie.c2
-rw-r--r--net/sunrpc/rpc_pipe.c12
-rw-r--r--net/tipc/crypto.c5
-rw-r--r--net/tls/tls_strp.c3
-rw-r--r--net/unix/af_unix.c137
-rw-r--r--net/xdp/xsk.c2
-rw-r--r--net/xfrm/espintcp.c4
-rw-r--r--net/xfrm/xfrm_ipcomp.c3
-rw-r--r--net/xfrm/xfrm_policy.c3
-rw-r--r--net/xfrm/xfrm_state.c6
-rw-r--r--security/apparmor/apparmorfs.c4
-rw-r--r--security/inode.c2
-rw-r--r--security/landlock/audit.c4
-rw-r--r--security/landlock/id.c33
-rw-r--r--security/landlock/syscalls.c3
-rw-r--r--security/selinux/selinuxfs.c4
-rw-r--r--sound/core/oss/pcm_oss.c3
-rw-r--r--sound/core/pcm_native.c11
-rw-r--r--sound/core/seq/seq_clientmgr.c52
-rw-r--r--sound/core/seq/seq_ump_convert.c18
-rw-r--r--sound/core/seq/seq_ump_convert.h1
-rw-r--r--sound/hda/intel-sdw-acpi.c2
-rw-r--r--sound/pci/es1968.c6
-rw-r--r--sound/pci/hda/patch_realtek.c9
-rw-r--r--sound/sh/Kconfig2
-rw-r--r--sound/soc/mediatek/Kconfig1
-rw-r--r--sound/soc/sof/intel/hda-bus.c2
-rw-r--r--sound/soc/sof/intel/hda.c16
-rw-r--r--sound/soc/sof/ipc4-control.c11
-rw-r--r--sound/soc/sof/ipc4-pcm.c3
-rw-r--r--sound/soc/sof/topology.c18
-rw-r--r--sound/usb/quirks.c4
-rw-r--r--tools/include/uapi/linux/fanotify.h274
-rw-r--r--tools/include/uapi/linux/mount.h235
-rw-r--r--tools/include/uapi/linux/nsfs.h45
-rwxr-xr-xtools/net/ynl/pyynl/ethtool.py22
-rwxr-xr-xtools/net/ynl/pyynl/ynl_gen_c.py7
-rw-r--r--tools/testing/selftests/bpf/config.aarch641
-rw-r--r--tools/testing/selftests/bpf/config.s390x1
-rw-r--r--tools/testing/selftests/coredump/stackdump_test.c477
-rw-r--r--tools/testing/selftests/drivers/net/hw/ncdevmem.c55
-rw-r--r--tools/testing/selftests/filesystems/.gitignore1
-rw-r--r--tools/testing/selftests/filesystems/Makefile2
-rw-r--r--tools/testing/selftests/filesystems/anon_inode_test.c69
-rw-r--r--tools/testing/selftests/filesystems/mount-notify/.gitignore1
-rw-r--r--tools/testing/selftests/filesystems/mount-notify/Makefile9
-rw-r--r--tools/testing/selftests/filesystems/mount-notify/mount-notify_test.c38
-rw-r--r--tools/testing/selftests/filesystems/mount-notify/mount-notify_test_ns.c557
-rw-r--r--tools/testing/selftests/filesystems/overlayfs/Makefile2
-rw-r--r--tools/testing/selftests/filesystems/overlayfs/dev_in_maps.c2
-rw-r--r--tools/testing/selftests/filesystems/overlayfs/set_layers_via_fds.c2
-rw-r--r--tools/testing/selftests/filesystems/statmount/Makefile6
-rw-r--r--tools/testing/selftests/filesystems/statmount/statmount.h36
-rw-r--r--tools/testing/selftests/filesystems/statmount/statmount_test_ns.c86
-rw-r--r--tools/testing/selftests/filesystems/utils.c88
-rw-r--r--tools/testing/selftests/filesystems/utils.h3
-rw-r--r--tools/testing/selftests/filesystems/wrappers.h (renamed from tools/testing/selftests/filesystems/overlayfs/wrappers.h)46
-rw-r--r--tools/testing/selftests/mount_setattr/Makefile2
-rw-r--r--tools/testing/selftests/mount_setattr/mount_setattr_test.c61
-rw-r--r--tools/testing/selftests/pidfd/pidfd.h22
-rw-r--r--tools/testing/selftests/pidfd/pidfd_bind_mount.c74
-rw-r--r--tools/testing/selftests/pidfd/pidfd_info_test.c13
-rw-r--r--tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json27
-rw-r--r--tools/testing/selftests/tc-testing/tc-tests/qdiscs/codel.json24
-rw-r--r--tools/testing/selftests/tc-testing/tc-tests/qdiscs/fq.json22
-rw-r--r--tools/testing/selftests/tc-testing/tc-tests/qdiscs/fq_codel.json22
-rw-r--r--tools/testing/selftests/tc-testing/tc-tests/qdiscs/fq_pie.json22
-rw-r--r--tools/testing/selftests/tc-testing/tc-tests/qdiscs/hhf.json22
-rw-r--r--tools/testing/selftests/tc-testing/tc-tests/qdiscs/pie.json24
-rw-r--r--tools/testing/selftests/ublk/Makefile11
-rw-r--r--tools/testing/selftests/ublk/fault_inject.c5
-rw-r--r--tools/testing/selftests/ublk/file_backed.c17
-rw-r--r--tools/testing/selftests/ublk/kublk.c153
-rw-r--r--tools/testing/selftests/ublk/kublk.h22
-rw-r--r--tools/testing/selftests/ublk/null.c55
-rw-r--r--tools/testing/selftests/ublk/stripe.c26
-rwxr-xr-xtools/testing/selftests/ublk/test_common.sh39
-rwxr-xr-xtools/testing/selftests/ublk/test_generic_04.sh2
-rwxr-xr-xtools/testing/selftests/ublk/test_generic_05.sh2
-rwxr-xr-xtools/testing/selftests/ublk/test_generic_06.sh2
-rwxr-xr-xtools/testing/selftests/ublk/test_generic_08.sh32
-rwxr-xr-xtools/testing/selftests/ublk/test_generic_09.sh28
-rwxr-xr-xtools/testing/selftests/ublk/test_generic_10.sh30
-rwxr-xr-xtools/testing/selftests/ublk/test_generic_11.sh44
-rwxr-xr-xtools/testing/selftests/ublk/test_stress_02.sh10
-rwxr-xr-xtools/testing/selftests/ublk/test_stress_03.sh7
-rwxr-xr-xtools/testing/selftests/ublk/test_stress_04.sh7
-rwxr-xr-xtools/testing/selftests/ublk/test_stress_05.sh9
-rw-r--r--tools/testing/vsock/vsock_test.c28
673 files changed, 15154 insertions, 7260 deletions
diff --git a/.mailmap b/.mailmap
index 9531546fcac9..a885e2eefc69 100644
--- a/.mailmap
+++ b/.mailmap
@@ -313,6 +313,7 @@ Jan Glauber <jan.glauber@gmail.com> <jglauber@cavium.com>
Jan Kuliga <jtkuliga.kdev@gmail.com> <jankul@alatek.krakow.pl>
Jarkko Sakkinen <jarkko@kernel.org> <jarkko.sakkinen@linux.intel.com>
Jarkko Sakkinen <jarkko@kernel.org> <jarkko@profian.com>
+Jarkko Sakkinen <jarkko@kernel.org> <jarkko.sakkinen@opinsys.com>
Jason Gunthorpe <jgg@ziepe.ca> <jgg@mellanox.com>
Jason Gunthorpe <jgg@ziepe.ca> <jgg@nvidia.com>
Jason Gunthorpe <jgg@ziepe.ca> <jgunthorpe@obsidianresearch.com>
diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block
index 11545c9e2e93..4ba771b56b3b 100644
--- a/Documentation/ABI/stable/sysfs-block
+++ b/Documentation/ABI/stable/sysfs-block
@@ -547,6 +547,21 @@ Description:
[RO] Maximum size in bytes of a single element in a DMA
scatter/gather list.
+What: /sys/block/<disk>/queue/max_write_streams
+Date: November 2024
+Contact: linux-block@vger.kernel.org
+Description:
+ [RO] Maximum number of write streams supported, 0 if not
+ supported. If supported, valid values are 1 through
+ max_write_streams, inclusive.
+
+What: /sys/block/<disk>/queue/write_stream_granularity
+Date: November 2024
+Contact: linux-block@vger.kernel.org
+Description:
+ [RO] Granularity of a write stream in bytes. The granularity
+ of a write stream is the size that should be discarded or
+ overwritten together to avoid write amplification in the device.
What: /sys/block/<disk>/queue/max_segments
Date: March 2010
diff --git a/Documentation/ABI/testing/sysfs-driver-hid-appletb-kbd b/Documentation/ABI/testing/sysfs-driver-hid-appletb-kbd
index 2a19584d091e..8c9718d83e9d 100644
--- a/Documentation/ABI/testing/sysfs-driver-hid-appletb-kbd
+++ b/Documentation/ABI/testing/sysfs-driver-hid-appletb-kbd
@@ -1,6 +1,6 @@
What: /sys/bus/hid/drivers/hid-appletb-kbd/<dev>/mode
-Date: September, 2023
-KernelVersion: 6.5
+Date: March, 2025
+KernelVersion: 6.15
Contact: linux-input@vger.kernel.org
Description:
The set of keys displayed on the Touch Bar.
diff --git a/Documentation/admin-guide/blockdev/index.rst b/Documentation/admin-guide/blockdev/index.rst
index 957ccf617797..3262397ebe8f 100644
--- a/Documentation/admin-guide/blockdev/index.rst
+++ b/Documentation/admin-guide/blockdev/index.rst
@@ -11,6 +11,7 @@ Block Devices
nbd
paride
ramdisk
+ zoned_loop
zram
drbd/index
diff --git a/Documentation/admin-guide/blockdev/zoned_loop.rst b/Documentation/admin-guide/blockdev/zoned_loop.rst
new file mode 100644
index 000000000000..9c7aa3b482f3
--- /dev/null
+++ b/Documentation/admin-guide/blockdev/zoned_loop.rst
@@ -0,0 +1,169 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=======================
+Zoned Loop Block Device
+=======================
+
+.. Contents:
+
+ 1) Overview
+ 2) Creating a Zoned Device
+ 3) Deleting a Zoned Device
+ 4) Example
+
+
+1) Overview
+-----------
+
+The zoned loop block device driver (zloop) allows a user to create a zoned block
+device using one regular file per zone as backing storage. This driver does not
+directly control any hardware and uses read, write and truncate operations to
+regular files of a file system to emulate a zoned block device.
+
+Using zloop, zoned block devices with a configurable capacity, zone size and
+number of conventional zones can be created. The storage for each zone of the
+device is implemented using a regular file with a maximum size equal to the zone
+size. The size of a file backing a conventional zone is always equal to the zone
+size. The size of a file backing a sequential zone indicates the amount of data
+sequentially written to the file, that is, the size of the file directly
+indicates the position of the write pointer of the zone.
+
+When resetting a sequential zone, its backing file size is truncated to zero.
+Conversely, for a zone finish operation, the backing file is truncated to the
+zone size. With this, the maximum capacity of a zloop zoned block device created
+can be larger configured to be larger than the storage space available on the
+backing file system. Of course, for such configuration, writing more data than
+the storage space available on the backing file system will result in write
+errors.
+
+The zoned loop block device driver implements a complete zone transition state
+machine. That is, zones can be empty, implicitly opened, explicitly opened,
+closed or full. The current implementation does not support any limits on the
+maximum number of open and active zones.
+
+No user tools are necessary to create and delete zloop devices.
+
+2) Creating a Zoned Device
+--------------------------
+
+Once the zloop module is loaded (or if zloop is compiled in the kernel), the
+character device file /dev/zloop-control can be used to add a zloop device.
+This is done by writing an "add" command directly to the /dev/zloop-control
+device::
+
+ $ modprobe zloop
+ $ ls -l /dev/zloop*
+ crw-------. 1 root root 10, 123 Jan 6 19:18 /dev/zloop-control
+
+ $ mkdir -p <base directory/<device ID>
+ $ echo "add [options]" > /dev/zloop-control
+
+The options available for the add command can be listed by reading the
+/dev/zloop-control device::
+
+ $ cat /dev/zloop-control
+ add id=%d,capacity_mb=%u,zone_size_mb=%u,zone_capacity_mb=%u,conv_zones=%u,base_dir=%s,nr_queues=%u,queue_depth=%u,buffered_io
+ remove id=%d
+
+In more details, the options that can be used with the "add" command are as
+follows.
+
+================ ===========================================================
+id Device number (the X in /dev/zloopX).
+ Default: automatically assigned.
+capacity_mb Device total capacity in MiB. This is always rounded up to
+ the nearest higher multiple of the zone size.
+ Default: 16384 MiB (16 GiB).
+zone_size_mb Device zone size in MiB. Default: 256 MiB.
+zone_capacity_mb Device zone capacity (must always be equal to or lower than
+ the zone size. Default: zone size.
+conv_zones Total number of conventioanl zones starting from sector 0.
+ Default: 8.
+base_dir Path to the base directoy where to create the directory
+ containing the zone files of the device.
+ Default=/var/local/zloop.
+ The device directory containing the zone files is always
+ named with the device ID. E.g. the default zone file
+ directory for /dev/zloop0 is /var/local/zloop/0.
+nr_queues Number of I/O queues of the zoned block device. This value is
+ always capped by the number of online CPUs
+ Default: 1
+queue_depth Maximum I/O queue depth per I/O queue.
+ Default: 64
+buffered_io Do buffered IOs instead of direct IOs (default: false)
+================ ===========================================================
+
+3) Deleting a Zoned Device
+--------------------------
+
+Deleting an unused zoned loop block device is done by issuing the "remove"
+command to /dev/zloop-control, specifying the ID of the device to remove::
+
+ $ echo "remove id=X" > /dev/zloop-control
+
+The remove command does not have any option.
+
+A zoned device that was removed can be re-added again without any change to the
+state of the device zones: the device zones are restored to their last state
+before the device was removed. Adding again a zoned device after it was removed
+must always be done using the same configuration as when the device was first
+added. If a zone configuration change is detected, an error will be returned and
+the zoned device will not be created.
+
+To fully delete a zoned device, after executing the remove operation, the device
+base directory containing the backing files of the device zones must be deleted.
+
+4) Example
+----------
+
+The following sequence of commands creates a 2GB zoned device with zones of 64
+MB and a zone capacity of 63 MB::
+
+ $ modprobe zloop
+ $ mkdir -p /var/local/zloop/0
+ $ echo "add capacity_mb=2048,zone_size_mb=64,zone_capacity=63MB" > /dev/zloop-control
+
+For the device created (/dev/zloop0), the zone backing files are all created
+under the default base directory (/var/local/zloop)::
+
+ $ ls -l /var/local/zloop/0
+ total 0
+ -rw-------. 1 root root 67108864 Jan 6 22:23 cnv-000000
+ -rw-------. 1 root root 67108864 Jan 6 22:23 cnv-000001
+ -rw-------. 1 root root 67108864 Jan 6 22:23 cnv-000002
+ -rw-------. 1 root root 67108864 Jan 6 22:23 cnv-000003
+ -rw-------. 1 root root 67108864 Jan 6 22:23 cnv-000004
+ -rw-------. 1 root root 67108864 Jan 6 22:23 cnv-000005
+ -rw-------. 1 root root 67108864 Jan 6 22:23 cnv-000006
+ -rw-------. 1 root root 67108864 Jan 6 22:23 cnv-000007
+ -rw-------. 1 root root 0 Jan 6 22:23 seq-000008
+ -rw-------. 1 root root 0 Jan 6 22:23 seq-000009
+ ...
+
+The zoned device created (/dev/zloop0) can then be used normally::
+
+ $ lsblk -z
+ NAME ZONED ZONE-SZ ZONE-NR ZONE-AMAX ZONE-OMAX ZONE-APP ZONE-WGRAN
+ zloop0 host-managed 64M 32 0 0 1M 4K
+ $ blkzone report /dev/zloop0
+ start: 0x000000000, len 0x020000, cap 0x020000, wptr 0x000000 reset:0 non-seq:0, zcond: 0(nw) [type: 1(CONVENTIONAL)]
+ start: 0x000020000, len 0x020000, cap 0x020000, wptr 0x000000 reset:0 non-seq:0, zcond: 0(nw) [type: 1(CONVENTIONAL)]
+ start: 0x000040000, len 0x020000, cap 0x020000, wptr 0x000000 reset:0 non-seq:0, zcond: 0(nw) [type: 1(CONVENTIONAL)]
+ start: 0x000060000, len 0x020000, cap 0x020000, wptr 0x000000 reset:0 non-seq:0, zcond: 0(nw) [type: 1(CONVENTIONAL)]
+ start: 0x000080000, len 0x020000, cap 0x020000, wptr 0x000000 reset:0 non-seq:0, zcond: 0(nw) [type: 1(CONVENTIONAL)]
+ start: 0x0000a0000, len 0x020000, cap 0x020000, wptr 0x000000 reset:0 non-seq:0, zcond: 0(nw) [type: 1(CONVENTIONAL)]
+ start: 0x0000c0000, len 0x020000, cap 0x020000, wptr 0x000000 reset:0 non-seq:0, zcond: 0(nw) [type: 1(CONVENTIONAL)]
+ start: 0x0000e0000, len 0x020000, cap 0x020000, wptr 0x000000 reset:0 non-seq:0, zcond: 0(nw) [type: 1(CONVENTIONAL)]
+ start: 0x000100000, len 0x020000, cap 0x01f800, wptr 0x000000 reset:0 non-seq:0, zcond: 1(em) [type: 2(SEQ_WRITE_REQUIRED)]
+ start: 0x000120000, len 0x020000, cap 0x01f800, wptr 0x000000 reset:0 non-seq:0, zcond: 1(em) [type: 2(SEQ_WRITE_REQUIRED)]
+ ...
+
+Deleting this device is done using the command::
+
+ $ echo "remove id=0" > /dev/zloop-control
+
+The removed device can be re-added again using the same "add" command as when
+the device was first created. To fully delete a zoned device, its backing files
+should also be deleted after executing the remove command::
+
+ $ rm -r /var/local/zloop/0
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 1a16ce68a4d7..9e7de8e70048 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -3019,7 +3019,7 @@ Filesystem Support for Writeback
--------------------------------
A filesystem can support cgroup writeback by updating
-address_space_operations->writepage[s]() to annotate bio's using the
+address_space_operations->writepages() to annotate bio's using the
following two functions.
wbc_init_bio(@wbc, @bio)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 8f75ec177399..30532eb7bb19 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -6268,7 +6268,7 @@
port and the regular usb controller gets disabled.
root= [KNL] Root filesystem
- Usually this a a block device specifier of some kind,
+ Usually this is a block device specifier of some kind,
see the early_lookup_bdev comment in
block/early-lookup.c for details.
Alternatively this can be "ram" for the legacy initial
diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst
index 8290177b4f75..d385985b305f 100644
--- a/Documentation/admin-guide/sysctl/vm.rst
+++ b/Documentation/admin-guide/sysctl/vm.rst
@@ -75,6 +75,7 @@ Currently, these files are in /proc/sys/vm:
- unprivileged_userfaultfd
- user_reserve_kbytes
- vfs_cache_pressure
+- vfs_cache_pressure_denom
- watermark_boost_factor
- watermark_scale_factor
- zone_reclaim_mode
@@ -1017,19 +1018,28 @@ vfs_cache_pressure
This percentage value controls the tendency of the kernel to reclaim
the memory which is used for caching of directory and inode objects.
-At the default value of vfs_cache_pressure=100 the kernel will attempt to
-reclaim dentries and inodes at a "fair" rate with respect to pagecache and
-swapcache reclaim. Decreasing vfs_cache_pressure causes the kernel to prefer
-to retain dentry and inode caches. When vfs_cache_pressure=0, the kernel will
-never reclaim dentries and inodes due to memory pressure and this can easily
-lead to out-of-memory conditions. Increasing vfs_cache_pressure beyond 100
-causes the kernel to prefer to reclaim dentries and inodes.
+At the default value of vfs_cache_pressure=vfs_cache_pressure_denom the kernel
+will attempt to reclaim dentries and inodes at a "fair" rate with respect to
+pagecache and swapcache reclaim. Decreasing vfs_cache_pressure causes the
+kernel to prefer to retain dentry and inode caches. When vfs_cache_pressure=0,
+the kernel will never reclaim dentries and inodes due to memory pressure and
+this can easily lead to out-of-memory conditions. Increasing vfs_cache_pressure
+beyond vfs_cache_pressure_denom causes the kernel to prefer to reclaim dentries
+and inodes.
-Increasing vfs_cache_pressure significantly beyond 100 may have negative
-performance impact. Reclaim code needs to take various locks to find freeable
-directory and inode objects. With vfs_cache_pressure=1000, it will look for
-ten times more freeable objects than there are.
+Increasing vfs_cache_pressure significantly beyond vfs_cache_pressure_denom may
+have negative performance impact. Reclaim code needs to take various locks to
+find freeable directory and inode objects. When vfs_cache_pressure equals
+(10 * vfs_cache_pressure_denom), it will look for ten times more freeable
+objects than there are.
+Note: This setting should always be used together with vfs_cache_pressure_denom.
+
+vfs_cache_pressure_denom
+========================
+
+Defaults to 100 (minimum allowed value). Requires corresponding
+vfs_cache_pressure setting to take effect.
watermark_boost_factor
======================
diff --git a/Documentation/devicetree/bindings/net/can/microchip,mcp2510.yaml b/Documentation/devicetree/bindings/net/can/microchip,mcp2510.yaml
index e0ec53bc10c6..1525a50ded47 100644
--- a/Documentation/devicetree/bindings/net/can/microchip,mcp2510.yaml
+++ b/Documentation/devicetree/bindings/net/can/microchip,mcp2510.yaml
@@ -1,7 +1,7 @@
# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
%YAML 1.2
---
-$id: http://devicetree.org/schemas/can/microchip,mcp2510.yaml#
+$id: http://devicetree.org/schemas/net/can/microchip,mcp2510.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
title: Microchip MCP251X stand-alone CAN controller
diff --git a/Documentation/driver-api/early-userspace/buffer-format.rst b/Documentation/driver-api/early-userspace/buffer-format.rst
index 7f74e301fdf3..726bfa2fe70d 100644
--- a/Documentation/driver-api/early-userspace/buffer-format.rst
+++ b/Documentation/driver-api/early-userspace/buffer-format.rst
@@ -4,20 +4,18 @@ initramfs buffer format
Al Viro, H. Peter Anvin
-Last revision: 2002-01-13
-
-Starting with kernel 2.5.x, the old "initial ramdisk" protocol is
-getting {replaced/complemented} with the new "initial ramfs"
-(initramfs) protocol. The initramfs contents is passed using the same
-memory buffer protocol used by the initrd protocol, but the contents
+With kernel 2.5.x, the old "initial ramdisk" protocol was complemented
+with an "initial ramfs" protocol. The initramfs content is passed
+using the same memory buffer protocol used by initrd, but the content
is different. The initramfs buffer contains an archive which is
-expanded into a ramfs filesystem; this document details the format of
-the initramfs buffer format.
+expanded into a ramfs filesystem; this document details the initramfs
+buffer format.
The initramfs buffer format is based around the "newc" or "crc" CPIO
formats, and can be created with the cpio(1) utility. The cpio
-archive can be compressed using gzip(1). One valid version of an
-initramfs buffer is thus a single .cpio.gz file.
+archive can be compressed using gzip(1), or any other algorithm provided
+via CONFIG_DECOMPRESS_*. One valid version of an initramfs buffer is
+thus a single .cpio.gz file.
The full format of the initramfs buffer is defined by the following
grammar, where::
@@ -25,12 +23,20 @@ grammar, where::
* is used to indicate "0 or more occurrences of"
(|) indicates alternatives
+ indicates concatenation
- GZIP() indicates the gzip(1) of the operand
+ GZIP() indicates gzip compression of the operand
+ BZIP2() indicates bzip2 compression of the operand
+ LZMA() indicates lzma compression of the operand
+ XZ() indicates xz compression of the operand
+ LZO() indicates lzo compression of the operand
+ LZ4() indicates lz4 compression of the operand
+ ZSTD() indicates zstd compression of the operand
ALGN(n) means padding with null bytes to an n-byte boundary
- initramfs := ("\0" | cpio_archive | cpio_gzip_archive)*
+ initramfs := ("\0" | cpio_archive | cpio_compressed_archive)*
- cpio_gzip_archive := GZIP(cpio_archive)
+ cpio_compressed_archive := (GZIP(cpio_archive) | BZIP2(cpio_archive)
+ | LZMA(cpio_archive) | XZ(cpio_archive) | LZO(cpio_archive)
+ | LZ4(cpio_archive) | ZSTD(cpio_archive))
cpio_archive := cpio_file* + (<nothing> | cpio_trailer)
@@ -75,6 +81,8 @@ c_chksum 8 bytes Checksum of data field if c_magic is 070702;
The c_mode field matches the contents of st_mode returned by stat(2)
on Linux, and encodes the file type and file permissions.
+c_mtime is ignored unless CONFIG_INITRAMFS_PRESERVE_MTIME=y is set.
+
The c_filesize should be zero for any file which is not a regular file
or symlink.
diff --git a/Documentation/filesystems/fscrypt.rst b/Documentation/filesystems/fscrypt.rst
index e80329908549..3d22e2db732d 100644
--- a/Documentation/filesystems/fscrypt.rst
+++ b/Documentation/filesystems/fscrypt.rst
@@ -1409,7 +1409,7 @@ read the ciphertext into the page cache and decrypt it in-place. The
folio lock must be held until decryption has finished, to prevent the
folio from becoming visible to userspace prematurely.
-For the write path (->writepage()) of regular files, filesystems
+For the write path (->writepages()) of regular files, filesystems
cannot encrypt data in-place in the page cache, since the cached
plaintext must be preserved. Instead, filesystems must encrypt into a
temporary buffer or "bounce page", then write out the temporary
diff --git a/Documentation/filesystems/iomap/design.rst b/Documentation/filesystems/iomap/design.rst
index e29651a42eec..f2df9b6df988 100644
--- a/Documentation/filesystems/iomap/design.rst
+++ b/Documentation/filesystems/iomap/design.rst
@@ -243,13 +243,25 @@ The fields are as follows:
regular file data.
This is only useful for FIEMAP.
- * **IOMAP_F_PRIVATE**: Starting with this value, the upper bits can
- be set by the filesystem for its own purposes.
+ * **IOMAP_F_BOUNDARY**: This indicates I/O and its completion must not be
+ merged with any other I/O or completion. Filesystems must use this when
+ submitting I/O to devices that cannot handle I/O crossing certain LBAs
+ (e.g. ZNS devices). This flag applies only to buffered I/O writeback; all
+ other functions ignore it.
+
+ * **IOMAP_F_PRIVATE**: This flag is reserved for filesystem private use.
* **IOMAP_F_ANON_WRITE**: Indicates that (write) I/O does not have a target
block assigned to it yet and the file system will do that in the bio
submission handler, splitting the I/O as needed.
+ * **IOMAP_F_ATOMIC_BIO**: This indicates write I/O must be submitted with the
+ ``REQ_ATOMIC`` flag set in the bio. Filesystems need to set this flag to
+ inform iomap that the write I/O operation requires torn-write protection
+ based on HW-offload mechanism. They must also ensure that mapping updates
+ upon the completion of the I/O must be performed in a single metadata
+ update.
+
These flags can be set by iomap itself during file operations.
The filesystem should supply an ``->iomap_end`` function if it needs
to observe these flags:
diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst
index 0ec0bb6eb0fb..2e567e341c3b 100644
--- a/Documentation/filesystems/locking.rst
+++ b/Documentation/filesystems/locking.rst
@@ -249,7 +249,6 @@ address_space_operations
========================
prototypes::
- int (*writepage)(struct page *page, struct writeback_control *wbc);
int (*read_folio)(struct file *, struct folio *);
int (*writepages)(struct address_space *, struct writeback_control *);
bool (*dirty_folio)(struct address_space *, struct folio *folio);
@@ -280,7 +279,6 @@ locking rules:
====================== ======================== ========= ===============
ops folio locked i_rwsem invalidate_lock
====================== ======================== ========= ===============
-writepage: yes, unlocks (see below)
read_folio: yes, unlocks shared
writepages:
dirty_folio: maybe
@@ -309,54 +307,6 @@ completion.
->readahead() unlocks the folios that I/O is attempted on like ->read_folio().
-->writepage() is used for two purposes: for "memory cleansing" and for
-"sync". These are quite different operations and the behaviour may differ
-depending upon the mode.
-
-If writepage is called for sync (wbc->sync_mode != WBC_SYNC_NONE) then
-it *must* start I/O against the page, even if that would involve
-blocking on in-progress I/O.
-
-If writepage is called for memory cleansing (sync_mode ==
-WBC_SYNC_NONE) then its role is to get as much writeout underway as
-possible. So writepage should try to avoid blocking against
-currently-in-progress I/O.
-
-If the filesystem is not called for "sync" and it determines that it
-would need to block against in-progress I/O to be able to start new I/O
-against the page the filesystem should redirty the page with
-redirty_page_for_writepage(), then unlock the page and return zero.
-This may also be done to avoid internal deadlocks, but rarely.
-
-If the filesystem is called for sync then it must wait on any
-in-progress I/O and then start new I/O.
-
-The filesystem should unlock the page synchronously, before returning to the
-caller, unless ->writepage() returns special WRITEPAGE_ACTIVATE
-value. WRITEPAGE_ACTIVATE means that page cannot really be written out
-currently, and VM should stop calling ->writepage() on this page for some
-time. VM does this by moving page to the head of the active list, hence the
-name.
-
-Unless the filesystem is going to redirty_page_for_writepage(), unlock the page
-and return zero, writepage *must* run set_page_writeback() against the page,
-followed by unlocking it. Once set_page_writeback() has been run against the
-page, write I/O can be submitted and the write I/O completion handler must run
-end_page_writeback() once the I/O is complete. If no I/O is submitted, the
-filesystem must run end_page_writeback() against the page before returning from
-writepage.
-
-That is: after 2.5.12, pages which are under writeout are *not* locked. Note,
-if the filesystem needs the page to be locked during writeout, that is ok, too,
-the page is allowed to be unlocked at any point in time between the calls to
-set_page_writeback() and end_page_writeback().
-
-Note, failure to run either redirty_page_for_writepage() or the combination of
-set_page_writeback()/end_page_writeback() on a page submitted to writepage
-will leave the page itself marked clean but it will be tagged as dirty in the
-radix tree. This incoherency can lead to all sorts of hard-to-debug problems
-in the filesystem like having dirty inodes at umount and losing written data.
-
->writepages() is used for periodic writeback and for syscall-initiated
sync operations. The address_space should start I/O against at least
``*nr_to_write`` pages. ``*nr_to_write`` must be decremented for each page
@@ -364,8 +314,8 @@ which is written. The address_space implementation may write more (or less)
pages than ``*nr_to_write`` asks for, but it should try to be reasonably close.
If nr_to_write is NULL, all dirty pages must be written.
-writepages should _only_ write pages which are present on
-mapping->io_pages.
+writepages should _only_ write pages which are present in
+mapping->i_pages.
->dirty_folio() is called from various places in the kernel when
the target folio is marked as needing writeback. The folio cannot be
diff --git a/Documentation/filesystems/mount_api.rst b/Documentation/filesystems/mount_api.rst
index d92c276f1575..e149b89118c8 100644
--- a/Documentation/filesystems/mount_api.rst
+++ b/Documentation/filesystems/mount_api.rst
@@ -671,7 +671,6 @@ The members are as follows:
fsparam_bool() fs_param_is_bool
fsparam_u32() fs_param_is_u32
fsparam_u32oct() fs_param_is_u32_octal
- fsparam_u32hex() fs_param_is_u32_hex
fsparam_s32() fs_param_is_s32
fsparam_u64() fs_param_is_u64
fsparam_enum() fs_param_is_enum
@@ -755,21 +754,6 @@ process the parameters it is given.
* ::
- bool validate_constant_table(const struct constant_table *tbl,
- size_t tbl_size,
- int low, int high, int special);
-
- Validate a constant table. Checks that all the elements are appropriately
- ordered, that there are no duplicates and that the values are between low
- and high inclusive, though provision is made for one allowable special
- value outside of that range. If no special value is required, special
- should just be set to lie inside the low-to-high range.
-
- If all is good, true is returned. If the table is invalid, errors are
- logged to the kernel log buffer and false is returned.
-
- * ::
-
bool fs_validate_description(const char *name,
const struct fs_parameter_description *desc);
diff --git a/Documentation/filesystems/netfs_library.rst b/Documentation/filesystems/netfs_library.rst
index 3886c14f89f4..939b4b624fad 100644
--- a/Documentation/filesystems/netfs_library.rst
+++ b/Documentation/filesystems/netfs_library.rst
@@ -1,33 +1,187 @@
.. SPDX-License-Identifier: GPL-2.0
-=================================
-Network Filesystem Helper Library
-=================================
+===================================
+Network Filesystem Services Library
+===================================
.. Contents:
- Overview.
+ - Requests and streams.
+ - Subrequests.
+ - Result collection and retry.
+ - Local caching.
+ - Content encryption (fscrypt).
- Per-inode context.
- Inode context helper functions.
- - Buffered read helpers.
- - Read helper functions.
- - Read helper structures.
- - Read helper operations.
- - Read helper procedure.
- - Read helper cache API.
+ - Inode locking.
+ - Inode writeback.
+ - High-level VFS API.
+ - Unlocked read/write iter.
+ - Pre-locked read/write iter.
+ - Monolithic files API.
+ - Memory-mapped I/O API.
+ - High-level VM API.
+ - Deprecated PG_private2 API.
+ - I/O request API.
+ - Request structure.
+ - Stream structure.
+ - Subrequest structure.
+ - Filesystem methods.
+ - Terminating a subrequest.
+ - Local cache API.
+ - API function reference.
Overview
========
-The network filesystem helper library is a set of functions designed to aid a
-network filesystem in implementing VM/VFS operations. For the moment, that
-just includes turning various VM buffered read operations into requests to read
-from the server. The helper library, however, can also interpose other
-services, such as local caching or local data encryption.
+The network filesystem services library, netfslib, is a set of functions
+designed to aid a network filesystem in implementing VM/VFS API operations. It
+takes over the normal buffered read, readahead, write and writeback and also
+handles unbuffered and direct I/O.
-Note that the library module doesn't link against local caching directly, so
-access must be provided by the netfs.
+The library provides support for (re-)negotiation of I/O sizes and retrying
+failed I/O as well as local caching and will, in the future, provide content
+encryption.
+
+It insulates the filesystem from VM interface changes as much as possible and
+handles VM features such as large multipage folios. The filesystem basically
+just has to provide a way to perform read and write RPC calls.
+
+The way I/O is organised inside netfslib consists of a number of objects:
+
+ * A *request*. A request is used to track the progress of the I/O overall and
+ to hold on to resources. The collection of results is done at the request
+ level. The I/O within a request is divided into a number of parallel
+ streams of subrequests.
+
+ * A *stream*. A non-overlapping series of subrequests. The subrequests
+ within a stream do not have to be contiguous.
+
+ * A *subrequest*. This is the basic unit of I/O. It represents a single RPC
+ call or a single cache I/O operation. The library passes these to the
+ filesystem and the cache to perform.
+
+Requests and Streams
+--------------------
+
+When actually performing I/O (as opposed to just copying into the pagecache),
+netfslib will create one or more requests to track the progress of the I/O and
+to hold resources.
+
+A read operation will have a single stream and the subrequests within that
+stream may be of mixed origins, for instance mixing RPC subrequests and cache
+subrequests.
+
+On the other hand, a write operation may have multiple streams, where each
+stream targets a different destination. For instance, there may be one stream
+writing to the local cache and one to the server. Currently, only two streams
+are allowed, but this could be increased if parallel writes to multiple servers
+is desired.
+
+The subrequests within a write stream do not need to match alignment or size
+with the subrequests in another write stream and netfslib performs the tiling
+of subrequests in each stream over the source buffer independently. Further,
+each stream may contain holes that don't correspond to holes in the other
+stream.
+
+In addition, the subrequests do not need to correspond to the boundaries of the
+folios or vectors in the source/destination buffer. The library handles the
+collection of results and the wrangling of folio flags and references.
+
+Subrequests
+-----------
+
+Subrequests are at the heart of the interaction between netfslib and the
+filesystem using it. Each subrequest is expected to correspond to a single
+read or write RPC or cache operation. The library will stitch together the
+results from a set of subrequests to provide a higher level operation.
+
+Netfslib has two interactions with the filesystem or the cache when setting up
+a subrequest. First, there's an optional preparatory step that allows the
+filesystem to negotiate the limits on the subrequest, both in terms of maximum
+number of bytes and maximum number of vectors (e.g. for RDMA). This may
+involve negotiating with the server (e.g. cifs needing to acquire credits).
+
+And, secondly, there's the issuing step in which the subrequest is handed off
+to the filesystem to perform.
+
+Note that these two steps are done slightly differently between read and write:
+
+ * For reads, the VM/VFS tells us how much is being requested up front, so the
+ library can preset maximum values that the cache and then the filesystem can
+ then reduce. The cache also gets consulted first on whether it wants to do
+ a read before the filesystem is consulted.
+
+ * For writeback, it is unknown how much there will be to write until the
+ pagecache is walked, so no limit is set by the library.
+
+Once a subrequest is completed, the filesystem or cache informs the library of
+the completion and then collection is invoked. Depending on whether the
+request is synchronous or asynchronous, the collection of results will be done
+in either the application thread or in a work queue.
+
+Result Collection and Retry
+---------------------------
+
+As subrequests complete, the results are collected and collated by the library
+and folio unlocking is performed progressively (if appropriate). Once the
+request is complete, async completion will be invoked (again, if appropriate).
+It is possible for the filesystem to provide interim progress reports to the
+library to cause folio unlocking to happen earlier if possible.
+
+If any subrequests fail, netfslib can retry them. It will wait until all
+subrequests are completed, offer the filesystem the opportunity to fiddle with
+the resources/state held by the request and poke at the subrequests before
+re-preparing and re-issuing the subrequests.
+
+This allows the tiling of contiguous sets of failed subrequest within a stream
+to be changed, adding more subrequests or ditching excess as necessary (for
+instance, if the network sizes change or the server decides it wants smaller
+chunks).
+
+Further, if one or more contiguous cache-read subrequests fail, the library
+will pass them to the filesystem to perform instead, renegotiating and retiling
+them as necessary to fit with the filesystem's parameters rather than those of
+the cache.
+
+Local Caching
+-------------
+
+One of the services netfslib provides, via ``fscache``, is the option to cache
+on local disk a copy of the data obtained from/written to a network filesystem.
+The library will manage the storing, retrieval and some invalidation of data
+automatically on behalf of the filesystem if a cookie is attached to the
+``netfs_inode``.
+
+Note that local caching used to use the PG_private_2 (aliased as PG_fscache) to
+keep track of a page that was being written to the cache, but this is now
+deprecated as PG_private_2 will be removed.
+
+Instead, folios that are read from the server for which there was no data in
+the cache will be marked as dirty and will have ``folio->private`` set to a
+special value (``NETFS_FOLIO_COPY_TO_CACHE``) and left to writeback to write.
+If the folio is modified before that happened, the special value will be
+cleared and the write will become normally dirty.
+
+When writeback occurs, folios that are so marked will only be written to the
+cache and not to the server. Writeback handles mixed cache-only writes and
+server-and-cache writes by using two streams, sending one to the cache and one
+to the server. The server stream will have gaps in it corresponding to those
+folios.
+
+Content Encryption (fscrypt)
+----------------------------
+
+Though it does not do so yet, at some point netfslib will acquire the ability
+to do client-side content encryption on behalf of the network filesystem (Ceph,
+for example). fscrypt can be used for this if appropriate (it may not be -
+cifs, for example).
+
+The data will be stored encrypted in the local cache using the same manner of
+encryption as the data written to the server and the library will impose bounce
+buffering and RMW cycles as necessary.
Per-Inode Context
@@ -40,10 +194,13 @@ structure is defined::
struct netfs_inode {
struct inode inode;
const struct netfs_request_ops *ops;
- struct fscache_cookie *cache;
+ struct fscache_cookie * cache;
+ loff_t remote_i_size;
+ unsigned long flags;
+ ...
};
-A network filesystem that wants to use netfs lib must place one of these in its
+A network filesystem that wants to use netfslib must place one of these in its
inode wrapper struct instead of the VFS ``struct inode``. This can be done in
a way similar to the following::
@@ -56,7 +213,8 @@ This allows netfslib to find its state by using ``container_of()`` from the
inode pointer, thereby allowing the netfslib helper functions to be pointed to
directly by the VFS/VM operation tables.
-The structure contains the following fields:
+The structure contains the following fields that are of interest to the
+filesystem:
* ``inode``
@@ -71,6 +229,37 @@ The structure contains the following fields:
Local caching cookie, or NULL if no caching is enabled. This field does not
exist if fscache is disabled.
+ * ``remote_i_size``
+
+ The size of the file on the server. This differs from inode->i_size if
+ local modifications have been made but not yet written back.
+
+ * ``flags``
+
+ A set of flags, some of which the filesystem might be interested in:
+
+ * ``NETFS_ICTX_MODIFIED_ATTR``
+
+ Set if netfslib modifies mtime/ctime. The filesystem is free to ignore
+ this or clear it.
+
+ * ``NETFS_ICTX_UNBUFFERED``
+
+ Do unbuffered I/O upon the file. Like direct I/O but without the
+ alignment limitations. RMW will be performed if necessary. The pagecache
+ will not be used unless mmap() is also used.
+
+ * ``NETFS_ICTX_WRITETHROUGH``
+
+ Do writethrough caching upon the file. I/O will be set up and dispatched
+ as buffered writes are made to the page cache. mmap() does the normal
+ writeback thing.
+
+ * ``NETFS_ICTX_SINGLE_NO_UPLOAD``
+
+ Set if the file has a monolithic content that must be read entirely in a
+ single go and must not be written back to the server, though it can be
+ cached (e.g. AFS directories).
Inode Context Helper Functions
------------------------------
@@ -84,117 +273,250 @@ set the operations table pointer::
then a function to cast from the VFS inode structure to the netfs context::
- struct netfs_inode *netfs_node(struct inode *inode);
+ struct netfs_inode *netfs_inode(struct inode *inode);
and finally, a function to get the cache cookie pointer from the context
attached to an inode (or NULL if fscache is disabled)::
struct fscache_cookie *netfs_i_cookie(struct netfs_inode *ctx);
+Inode Locking
+-------------
+
+A number of functions are provided to manage the locking of i_rwsem for I/O and
+to effectively extend it to provide more separate classes of exclusion::
+
+ int netfs_start_io_read(struct inode *inode);
+ void netfs_end_io_read(struct inode *inode);
+ int netfs_start_io_write(struct inode *inode);
+ void netfs_end_io_write(struct inode *inode);
+ int netfs_start_io_direct(struct inode *inode);
+ void netfs_end_io_direct(struct inode *inode);
+
+The exclusion breaks down into four separate classes:
+
+ 1) Buffered reads and writes.
+
+ Buffered reads can run concurrently each other and with buffered writes,
+ but buffered writes cannot run concurrently with each other.
+
+ 2) Direct reads and writes.
+
+ Direct (and unbuffered) reads and writes can run concurrently since they do
+ not share local buffering (i.e. the pagecache) and, in a network
+ filesystem, are expected to have exclusion managed on the server (though
+ this may not be the case for, say, Ceph).
+
+ 3) Other major inode modifying operations (e.g. truncate, fallocate).
+
+ These should just access i_rwsem directly.
+
+ 4) mmap().
+
+ mmap'd accesses might operate concurrently with any of the other classes.
+ They might form the buffer for an intra-file loopback DIO read/write. They
+ might be permitted on unbuffered files.
+
+Inode Writeback
+---------------
+
+Netfslib will pin resources on an inode for future writeback (such as pinning
+use of an fscache cookie) when an inode is dirtied. However, this pinning
+needs careful management. To manage the pinning, the following sequence
+occurs:
+
+ 1) An inode state flag ``I_PINNING_NETFS_WB`` is set by netfslib when the
+ pinning begins (when a folio is dirtied, for example) if the cache is
+ active to stop the cache structures from being discarded and the cache
+ space from being culled. This also prevents re-getting of cache resources
+ if the flag is already set.
+
+ 2) This flag then cleared inside the inode lock during inode writeback in the
+ VM - and the fact that it was set is transferred to ``->unpinned_netfs_wb``
+ in ``struct writeback_control``.
+
+ 3) If ``->unpinned_netfs_wb`` is now set, the write_inode procedure is forced.
+
+ 4) The filesystem's ``->write_inode()`` function is invoked to do the cleanup.
+
+ 5) The filesystem invokes netfs to do its cleanup.
+
+To do the cleanup, netfslib provides a function to do the resource unpinning::
+
+ int netfs_unpin_writeback(struct inode *inode, struct writeback_control *wbc);
+
+If the filesystem doesn't need to do anything else, this may be set as a its
+``.write_inode`` method.
+
+Further, if an inode is deleted, the filesystem's write_inode method may not
+get called, so::
+
+ void netfs_clear_inode_writeback(struct inode *inode, const void *aux);
-Buffered Read Helpers
-=====================
+must be called from ``->evict_inode()`` *before* ``clear_inode()`` is called.
-The library provides a set of read helpers that handle the ->read_folio(),
-->readahead() and much of the ->write_begin() VM operations and translate them
-into a common call framework.
-The following services are provided:
+High-Level VFS API
+==================
- * Handle folios that span multiple pages.
+Netfslib provides a number of sets of API calls for the filesystem to delegate
+VFS operations to. Netfslib, in turn, will call out to the filesystem and the
+cache to negotiate I/O sizes, issue RPCs and provide places for it to intervene
+at various times.
- * Insulate the netfs from VM interface changes.
+Unlocked Read/Write Iter
+------------------------
- * Allow the netfs to arbitrarily split reads up into pieces, even ones that
- don't match folio sizes or folio alignments and that may cross folios.
+The first API set is for the delegation of operations to netfslib when the
+filesystem is called through the standard VFS read/write_iter methods::
- * Allow the netfs to expand a readahead request in both directions to meet its
- needs.
+ ssize_t netfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter);
+ ssize_t netfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from);
+ ssize_t netfs_buffered_read_iter(struct kiocb *iocb, struct iov_iter *iter);
+ ssize_t netfs_unbuffered_read_iter(struct kiocb *iocb, struct iov_iter *iter);
+ ssize_t netfs_unbuffered_write_iter(struct kiocb *iocb, struct iov_iter *from);
- * Allow the netfs to partially fulfil a read, which will then be resubmitted.
+They can be assigned directly to ``.read_iter`` and ``.write_iter``. They
+perform the inode locking themselves and the first two will switch between
+buffered I/O and DIO as appropriate.
- * Handle local caching, allowing cached data and server-read data to be
- interleaved for a single request.
+Pre-Locked Read/Write Iter
+--------------------------
- * Handle clearing of bufferage that isn't on the server.
+The second API set is for the delegation of operations to netfslib when the
+filesystem is called through the standard VFS methods, but needs to do some
+other stuff before or after calling netfslib whilst still inside locked section
+(e.g. Ceph negotiating caps). The unbuffered read function is::
- * Handle retrying of reads that failed, switching reads from the cache to the
- server as necessary.
+ ssize_t netfs_unbuffered_read_iter_locked(struct kiocb *iocb, struct iov_iter *iter);
- * In the future, this is a place that other services can be performed, such as
- local encryption of data to be stored remotely or in the cache.
+This must not be assigned directly to ``.read_iter`` and the filesystem is
+responsible for performing the inode locking before calling it. In the case of
+buffered read, the filesystem should use ``filemap_read()``.
-From the network filesystem, the helpers require a table of operations. This
-includes a mandatory method to issue a read operation along with a number of
-optional methods.
+There are three functions for writes::
+ ssize_t netfs_buffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *from,
+ struct netfs_group *netfs_group);
+ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
+ struct netfs_group *netfs_group);
+ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *iter,
+ struct netfs_group *netfs_group);
-Read Helper Functions
+These must not be assigned directly to ``.write_iter`` and the filesystem is
+responsible for performing the inode locking before calling them.
+
+The first two functions are for buffered writes; the first just adds some
+standard write checks and jumps to the second, but if the filesystem wants to
+do the checks itself, it can use the second directly. The third function is
+for unbuffered or DIO writes.
+
+On all three write functions, there is a writeback group pointer (which should
+be NULL if the filesystem doesn't use this). Writeback groups are set on
+folios when they're modified. If a folio to-be-modified is already marked with
+a different group, it is flushed first. The writeback API allows writing back
+of a specific group.
+
+Memory-Mapped I/O API
---------------------
-Three read helpers are provided::
+An API for support of mmap()'d I/O is provided::
+
+ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_group);
+
+This allows the filesystem to delegate ``.page_mkwrite`` to netfslib. The
+filesystem should not take the inode lock before calling it, but, as with the
+locked write functions above, this does take a writeback group pointer. If the
+page to be made writable is in a different group, it will be flushed first.
+
+Monolithic Files API
+--------------------
+
+There is also a special API set for files for which the content must be read in
+a single RPC (and not written back) and is maintained as a monolithic blob
+(e.g. an AFS directory), though it can be stored and updated in the local cache::
+
+ ssize_t netfs_read_single(struct inode *inode, struct file *file, struct iov_iter *iter);
+ void netfs_single_mark_inode_dirty(struct inode *inode);
+ int netfs_writeback_single(struct address_space *mapping,
+ struct writeback_control *wbc,
+ struct iov_iter *iter);
+
+The first function reads from a file into the given buffer, reading from the
+cache in preference if the data is cached there; the second function allows the
+inode to be marked dirty, causing a later writeback; and the third function can
+be called from the writeback code to write the data to the cache, if there is
+one.
- void netfs_readahead(struct readahead_control *ractl);
- int netfs_read_folio(struct file *file,
- struct folio *folio);
- int netfs_write_begin(struct netfs_inode *ctx,
- struct file *file,
- struct address_space *mapping,
- loff_t pos,
- unsigned int len,
- struct folio **_folio,
- void **_fsdata);
+The inode should be marked ``NETFS_ICTX_SINGLE_NO_UPLOAD`` if this API is to be
+used. The writeback function requires the buffer to be of ITER_FOLIOQ type.
-Each corresponds to a VM address space operation. These operations use the
-state in the per-inode context.
+High-Level VM API
+==================
-For ->readahead() and ->read_folio(), the network filesystem just point directly
-at the corresponding read helper; whereas for ->write_begin(), it may be a
-little more complicated as the network filesystem might want to flush
-conflicting writes or track dirty data and needs to put the acquired folio if
-an error occurs after calling the helper.
+Netfslib also provides a number of sets of API calls for the filesystem to
+delegate VM operations to. Again, netfslib, in turn, will call out to the
+filesystem and the cache to negotiate I/O sizes, issue RPCs and provide places
+for it to intervene at various times::
-The helpers manage the read request, calling back into the network filesystem
-through the supplied table of operations. Waits will be performed as
-necessary before returning for helpers that are meant to be synchronous.
+ void netfs_readahead(struct readahead_control *);
+ int netfs_read_folio(struct file *, struct folio *);
+ int netfs_writepages(struct address_space *mapping,
+ struct writeback_control *wbc);
+ bool netfs_dirty_folio(struct address_space *mapping, struct folio *folio);
+ void netfs_invalidate_folio(struct folio *folio, size_t offset, size_t length);
+ bool netfs_release_folio(struct folio *folio, gfp_t gfp);
-If an error occurs, the ->free_request() will be called to clean up the
-netfs_io_request struct allocated. If some parts of the request are in
-progress when an error occurs, the request will get partially completed if
-sufficient data is read.
+These are ``address_space_operations`` methods and can be set directly in the
+operations table.
-Additionally, there is::
+Deprecated PG_private_2 API
+---------------------------
- * void netfs_subreq_terminated(struct netfs_io_subrequest *subreq,
- ssize_t transferred_or_error,
- bool was_async);
+There is also a deprecated function for filesystems that still use the
+``->write_begin`` method::
-which should be called to complete a read subrequest. This is given the number
-of bytes transferred or a negative error code, plus a flag indicating whether
-the operation was asynchronous (ie. whether the follow-on processing can be
-done in the current context, given this may involve sleeping).
+ int netfs_write_begin(struct netfs_inode *inode, struct file *file,
+ struct address_space *mapping, loff_t pos, unsigned int len,
+ struct folio **_folio, void **_fsdata);
+It uses the deprecated PG_private_2 flag and so should not be used.
-Read Helper Structures
-----------------------
-The read helpers make use of a couple of structures to maintain the state of
-the read. The first is a structure that manages a read request as a whole::
+I/O Request API
+===============
+
+The I/O request API comprises a number of structures and a number of functions
+that the filesystem may need to use.
+
+Request Structure
+-----------------
+
+The request structure manages the request as a whole, holding some resources
+and state on behalf of the filesystem and tracking the collection of results::
struct netfs_io_request {
+ enum netfs_io_origin origin;
struct inode *inode;
struct address_space *mapping;
- struct netfs_cache_resources cache_resources;
+ struct netfs_group *group;
+ struct netfs_io_stream io_streams[];
void *netfs_priv;
- loff_t start;
- size_t len;
- loff_t i_size;
- const struct netfs_request_ops *netfs_ops;
+ void *netfs_priv2;
+ unsigned long long start;
+ unsigned long long len;
+ unsigned long long i_size;
unsigned int debug_id;
+ unsigned long flags;
...
};
-The above fields are the ones the netfs can use. They are:
+Many of the fields are for internal use, but the fields shown here are of
+interest to the filesystem:
+
+ * ``origin``
+
+ The origin of the request (readahead, read_folio, DIO read, writeback, ...).
* ``inode``
* ``mapping``
@@ -202,11 +524,19 @@ The above fields are the ones the netfs can use. They are:
The inode and the address space of the file being read from. The mapping
may or may not point to inode->i_data.
- * ``cache_resources``
+ * ``group``
+
+ The writeback group this request is dealing with or NULL. This holds a ref
+ on the group.
+
+ * ``io_streams``
- Resources for the local cache to use, if present.
+ The parallel streams of subrequests available to the request. Currently two
+ are available, but this may be made extensible in future. ``NR_IO_STREAMS``
+ indicates the size of the array.
* ``netfs_priv``
+ * ``netfs_priv2``
The network filesystem's private data. The value for this can be passed in
to the helper functions or set during the request.
@@ -221,37 +551,121 @@ The above fields are the ones the netfs can use. They are:
The size of the file at the start of the request.
- * ``netfs_ops``
-
- A pointer to the operation table. The value for this is passed into the
- helper functions.
-
* ``debug_id``
A number allocated to this operation that can be displayed in trace lines
for reference.
+ * ``flags``
+
+ Flags for managing and controlling the operation of the request. Some of
+ these may be of interest to the filesystem:
+
+ * ``NETFS_RREQ_RETRYING``
+
+ Netfslib sets this when generating retries.
+
+ * ``NETFS_RREQ_PAUSE``
+
+ The filesystem can set this to request to pause the library's subrequest
+ issuing loop - but care needs to be taken as netfslib may also set it.
+
+ * ``NETFS_RREQ_NONBLOCK``
+ * ``NETFS_RREQ_BLOCKED``
+
+ Netfslib sets the first to indicate that non-blocking mode was set by the
+ caller and the filesystem can set the second to indicate that it would
+ have had to block.
+
+ * ``NETFS_RREQ_USE_PGPRIV2``
+
+ The filesystem can set this if it wants to use PG_private_2 to track
+ whether a folio is being written to the cache. This is deprecated as
+ PG_private_2 is going to go away.
+
+If the filesystem wants more private data than is afforded by this structure,
+then it should wrap it and provide its own allocator.
+
+Stream Structure
+----------------
+
+A request is comprised of one or more parallel streams and each stream may be
+aimed at a different target.
+
+For read requests, only stream 0 is used. This can contain a mixture of
+subrequests aimed at different sources. For write requests, stream 0 is used
+for the server and stream 1 is used for the cache. For buffered writeback,
+stream 0 is not enabled unless a normal dirty folio is encountered, at which
+point ->begin_writeback() will be invoked and the filesystem can mark the
+stream available.
+
+The stream struct looks like::
+
+ struct netfs_io_stream {
+ unsigned char stream_nr;
+ bool avail;
+ size_t sreq_max_len;
+ unsigned int sreq_max_segs;
+ unsigned int submit_extendable_to;
+ ...
+ };
+
+A number of members are available for access/use by the filesystem:
+
+ * ``stream_nr``
+
+ The number of the stream within the request.
+
+ * ``avail``
+
+ True if the stream is available for use. The filesystem should set this on
+ stream zero if in ->begin_writeback().
+
+ * ``sreq_max_len``
+ * ``sreq_max_segs``
+
+ These are set by the filesystem or the cache in ->prepare_read() or
+ ->prepare_write() for each subrequest to indicate the maximum number of
+ bytes and, optionally, the maximum number of segments (if not 0) that that
+ subrequest can support.
+
+ * ``submit_extendable_to``
-The second structure is used to manage individual slices of the overall read
-request::
+ The size that a subrequest can be rounded up to beyond the EOF, given the
+ available buffer. This allows the cache to work out if it can do a DIO read
+ or write that straddles the EOF marker.
+
+Subrequest Structure
+--------------------
+
+Individual units of I/O are managed by the subrequest structure. These
+represent slices of the overall request and run independently::
struct netfs_io_subrequest {
struct netfs_io_request *rreq;
- loff_t start;
+ struct iov_iter io_iter;
+ unsigned long long start;
size_t len;
size_t transferred;
unsigned long flags;
+ short error;
unsigned short debug_index;
+ unsigned char stream_nr;
...
};
-Each subrequest is expected to access a single source, though the helpers will
+Each subrequest is expected to access a single source, though the library will
handle falling back from one source type to another. The members are:
* ``rreq``
A pointer to the read request.
+ * ``io_iter``
+
+ An I/O iterator representing a slice of the buffer to be read into or
+ written from.
+
* ``start``
* ``len``
@@ -260,241 +674,300 @@ handle falling back from one source type to another. The members are:
* ``transferred``
- The amount of data transferred so far of the length of this slice. The
- network filesystem or cache should start the operation this far into the
- slice. If a short read occurs, the helpers will call again, having updated
- this to reflect the amount read so far.
+ The amount of data transferred so far for this subrequest. This should be
+ added to with the length of the transfer made by this issuance of the
+ subrequest. If this is less than ``len`` then the subrequest may be
+ reissued to continue.
* ``flags``
- Flags pertaining to the read. There are two of interest to the filesystem
- or cache:
+ Flags for managing the subrequest. There are a number of interest to the
+ filesystem or cache:
+
+ * ``NETFS_SREQ_MADE_PROGRESS``
+
+ Set by the filesystem to indicates that at least one byte of data was read
+ or written.
+
+ * ``NETFS_SREQ_HIT_EOF``
+
+ The filesystem should set this if a read hit the EOF on the file (in which
+ case ``transferred`` should stop at the EOF). Netfslib may expand the
+ subrequest out to the size of the folio containing the EOF on the off
+ chance that a third party change happened or a DIO read may have asked for
+ more than is available. The library will clear any excess pagecache.
* ``NETFS_SREQ_CLEAR_TAIL``
- This can be set to indicate that the remainder of the slice, from
- transferred to len, should be cleared.
+ The filesystem can set this to indicate that the remainder of the slice,
+ from transferred to len, should be cleared. Do not set if HIT_EOF is set.
+
+ * ``NETFS_SREQ_NEED_RETRY``
+
+ The filesystem can set this to tell netfslib to retry the subrequest.
+
+ * ``NETFS_SREQ_BOUNDARY``
+
+ This can be set by the filesystem on a subrequest to indicate that it ends
+ at a boundary with the filesystem structure (e.g. at the end of a Ceph
+ object). It tells netfslib not to retile subrequests across it.
* ``NETFS_SREQ_SEEK_DATA_READ``
- This is a hint to the cache that it might want to try skipping ahead to
- the next data (ie. using SEEK_DATA).
+ This is a hint from netfslib to the cache that it might want to try
+ skipping ahead to the next data (ie. using SEEK_DATA).
+
+ * ``error``
+
+ This is for the filesystem to store result of the subrequest. It should be
+ set to 0 if successful and a negative error code otherwise.
* ``debug_index``
+ * ``stream_nr``
A number allocated to this slice that can be displayed in trace lines for
- reference.
+ reference and the number of the request stream that it belongs to.
+If necessary, the filesystem can get and put extra refs on the subrequest it is
+given::
-Read Helper Operations
-----------------------
+ void netfs_get_subrequest(struct netfs_io_subrequest *subreq,
+ enum netfs_sreq_ref_trace what);
+ void netfs_put_subrequest(struct netfs_io_subrequest *subreq,
+ enum netfs_sreq_ref_trace what);
-The network filesystem must provide the read helpers with a table of operations
-through which it can issue requests and negotiate::
+using netfs trace codes to indicate the reason. Care must be taken, however,
+as once control of the subrequest is returned to netfslib, the same subrequest
+can be reissued/retried.
+
+Filesystem Methods
+------------------
+
+The filesystem sets a table of operations in ``netfs_inode`` for netfslib to
+use::
struct netfs_request_ops {
- void (*init_request)(struct netfs_io_request *rreq, struct file *file);
+ mempool_t *request_pool;
+ mempool_t *subrequest_pool;
+ int (*init_request)(struct netfs_io_request *rreq, struct file *file);
void (*free_request)(struct netfs_io_request *rreq);
+ void (*free_subrequest)(struct netfs_io_subrequest *rreq);
void (*expand_readahead)(struct netfs_io_request *rreq);
- bool (*clamp_length)(struct netfs_io_subrequest *subreq);
+ int (*prepare_read)(struct netfs_io_subrequest *subreq);
void (*issue_read)(struct netfs_io_subrequest *subreq);
- bool (*is_still_valid)(struct netfs_io_request *rreq);
- int (*check_write_begin)(struct file *file, loff_t pos, unsigned len,
- struct folio **foliop, void **_fsdata);
void (*done)(struct netfs_io_request *rreq);
+ void (*update_i_size)(struct inode *inode, loff_t i_size);
+ void (*post_modify)(struct inode *inode);
+ void (*begin_writeback)(struct netfs_io_request *wreq);
+ void (*prepare_write)(struct netfs_io_subrequest *subreq);
+ void (*issue_write)(struct netfs_io_subrequest *subreq);
+ void (*retry_request)(struct netfs_io_request *wreq,
+ struct netfs_io_stream *stream);
+ void (*invalidate_cache)(struct netfs_io_request *wreq);
};
-The operations are as follows:
-
- * ``init_request()``
+The table starts with a pair of optional pointers to memory pools from which
+requests and subrequests can be allocated. If these are not given, netfslib
+has default pools that it will use instead. If the filesystem wraps the netfs
+structs in its own larger structs, then it will need to use its own pools.
+Netfslib will allocate directly from the pools.
- [Optional] This is called to initialise the request structure. It is given
- the file for reference.
+The methods defined in the table are:
+ * ``init_request()``
* ``free_request()``
+ * ``free_subrequest()``
- [Optional] This is called as the request is being deallocated so that the
- filesystem can clean up any state it has attached there.
+ [Optional] A filesystem may implement these to initialise or clean up any
+ resources that it attaches to the request or subrequest.
* ``expand_readahead()``
[Optional] This is called to allow the filesystem to expand the size of a
- readahead read request. The filesystem gets to expand the request in both
- directions, though it's not permitted to reduce it as the numbers may
- represent an allocation already made. If local caching is enabled, it gets
- to expand the request first.
+ readahead request. The filesystem gets to expand the request in both
+ directions, though it must retain the initial region as that may represent
+ an allocation already made. If local caching is enabled, it gets to expand
+ the request first.
Expansion is communicated by changing ->start and ->len in the request
structure. Note that if any change is made, ->len must be increased by at
least as much as ->start is reduced.
- * ``clamp_length()``
-
- [Optional] This is called to allow the filesystem to reduce the size of a
- subrequest. The filesystem can use this, for example, to chop up a request
- that has to be split across multiple servers or to put multiple reads in
- flight.
-
- This should return 0 on success and an error code on error.
-
- * ``issue_read()``
+ * ``prepare_read()``
- [Required] The helpers use this to dispatch a subrequest to the server for
- reading. In the subrequest, ->start, ->len and ->transferred indicate what
- data should be read from the server.
+ [Optional] This is called to allow the filesystem to limit the size of a
+ subrequest. It may also limit the number of individual regions in iterator,
+ such as required by RDMA. This information should be set on stream zero in::
- There is no return value; the netfs_subreq_terminated() function should be
- called to indicate whether or not the operation succeeded and how much data
- it transferred. The filesystem also should not deal with setting folios
- uptodate, unlocking them or dropping their refs - the helpers need to deal
- with this as they have to coordinate with copying to the local cache.
+ rreq->io_streams[0].sreq_max_len
+ rreq->io_streams[0].sreq_max_segs
- Note that the helpers have the folios locked, but not pinned. It is
- possible to use the ITER_XARRAY iov iterator to refer to the range of the
- inode that is being operated upon without the need to allocate large bvec
- tables.
+ The filesystem can use this, for example, to chop up a request that has to
+ be split across multiple servers or to put multiple reads in flight.
- * ``is_still_valid()``
+ Zero should be returned on success and an error code otherwise.
- [Optional] This is called to find out if the data just read from the local
- cache is still valid. It should return true if it is still valid and false
- if not. If it's not still valid, it will be reread from the server.
+ * ``issue_read()``
- * ``check_write_begin()``
+ [Required] Netfslib calls this to dispatch a subrequest to the server for
+ reading. In the subrequest, ->start, ->len and ->transferred indicate what
+ data should be read from the server and ->io_iter indicates the buffer to be
+ used.
- [Optional] This is called from the netfs_write_begin() helper once it has
- allocated/grabbed the folio to be modified to allow the filesystem to flush
- conflicting state before allowing it to be modified.
+ There is no return value; the ``netfs_read_subreq_terminated()`` function
+ should be called to indicate that the subrequest completed either way.
+ ->error, ->transferred and ->flags should be updated before completing. The
+ termination can be done asynchronously.
- It may unlock and discard the folio it was given and set the caller's folio
- pointer to NULL. It should return 0 if everything is now fine (``*foliop``
- left set) or the op should be retried (``*foliop`` cleared) and any other
- error code to abort the operation.
+ Note: the filesystem must not deal with setting folios uptodate, unlocking
+ them or dropping their refs - the library deals with this as it may have to
+ stitch together the results of multiple subrequests that variously overlap
+ the set of folios.
- * ``done``
+ * ``done()``
- [Optional] This is called after the folios in the request have all been
+ [Optional] This is called after the folios in a read request have all been
unlocked (and marked uptodate if applicable).
+ * ``update_i_size()``
+
+ [Optional] This is invoked by netfslib at various points during the write
+ paths to ask the filesystem to update its idea of the file size. If not
+ given, netfslib will set i_size and i_blocks and update the local cache
+ cookie.
+
+ * ``post_modify()``
+
+ [Optional] This is called after netfslib writes to the pagecache or when it
+ allows an mmap'd page to be marked as writable.
+
+ * ``begin_writeback()``
+
+ [Optional] Netfslib calls this when processing a writeback request if it
+ finds a dirty page that isn't simply marked NETFS_FOLIO_COPY_TO_CACHE,
+ indicating it must be written to the server. This allows the filesystem to
+ only set up writeback resources when it knows it's going to have to perform
+ a write.
+
+ * ``prepare_write()``
+ [Optional] This is called to allow the filesystem to limit the size of a
+ subrequest. It may also limit the number of individual regions in iterator,
+ such as required by RDMA. This information should be set on stream to which
+ the subrequest belongs::
-Read Helper Procedure
----------------------
-
-The read helpers work by the following general procedure:
-
- * Set up the request.
-
- * For readahead, allow the local cache and then the network filesystem to
- propose expansions to the read request. This is then proposed to the VM.
- If the VM cannot fully perform the expansion, a partially expanded read will
- be performed, though this may not get written to the cache in its entirety.
-
- * Loop around slicing chunks off of the request to form subrequests:
-
- * If a local cache is present, it gets to do the slicing, otherwise the
- helpers just try to generate maximal slices.
-
- * The network filesystem gets to clamp the size of each slice if it is to be
- the source. This allows rsize and chunking to be implemented.
+ rreq->io_streams[subreq->stream_nr].sreq_max_len
+ rreq->io_streams[subreq->stream_nr].sreq_max_segs
- * The helpers issue a read from the cache or a read from the server or just
- clears the slice as appropriate.
+ The filesystem can use this, for example, to chop up a request that has to
+ be split across multiple servers or to put multiple writes in flight.
- * The next slice begins at the end of the last one.
+ This is not permitted to return an error. Instead, in the event of failure,
+ ``netfs_prepare_write_failed()`` must be called.
- * As slices finish being read, they terminate.
+ * ``issue_write()``
- * When all the subrequests have terminated, the subrequests are assessed and
- any that are short or have failed are reissued:
+ [Required] This is used to dispatch a subrequest to the server for writing.
+ In the subrequest, ->start, ->len and ->transferred indicate what data
+ should be written to the server and ->io_iter indicates the buffer to be
+ used.
- * Failed cache requests are issued against the server instead.
+ There is no return value; the ``netfs_write_subreq_terminated()`` function
+ should be called to indicate that the subrequest completed either way.
+ ->error, ->transferred and ->flags should be updated before completing. The
+ termination can be done asynchronously.
- * Failed server requests just fail.
+ Note: the filesystem must not deal with removing the dirty or writeback
+ marks on folios involved in the operation and should not take refs or pins
+ on them, but should leave retention to netfslib.
- * Short reads against either source will be reissued against that source
- provided they have transferred some more data:
+ * ``retry_request()``
- * The cache may need to skip holes that it can't do DIO from.
+ [Optional] Netfslib calls this at the beginning of a retry cycle. This
+ allows the filesystem to examine the state of the request, the subrequests
+ in the indicated stream and of its own data and make adjustments or
+ renegotiate resources.
+
+ * ``invalidate_cache()``
- * If NETFS_SREQ_CLEAR_TAIL was set, a short read will be cleared to the
- end of the slice instead of reissuing.
+ [Optional] This is called by netfslib to invalidate data stored in the local
+ cache in the event that writing to the local cache fails, providing updated
+ coherency data that netfs can't provide.
- * Once the data is read, the folios that have been fully read/cleared:
+Terminating a subrequest
+------------------------
- * Will be marked uptodate.
+When a subrequest completes, there are a number of functions that the cache or
+subrequest can call to inform netfslib of the status change. One function is
+provided to terminate a write subrequest at the preparation stage and acts
+synchronously:
- * If a cache is present, will be marked with PG_fscache.
+ * ``void netfs_prepare_write_failed(struct netfs_io_subrequest *subreq);``
- * Unlocked
+ Indicate that the ->prepare_write() call failed. The ``error`` field should
+ have been updated.
- * Any folios that need writing to the cache will then have DIO writes issued.
+Note that ->prepare_read() can return an error as a read can simply be aborted.
+Dealing with writeback failure is trickier.
- * Synchronous operations will wait for reading to be complete.
+The other functions are used for subrequests that got as far as being issued:
- * Writes to the cache will proceed asynchronously and the folios will have the
- PG_fscache mark removed when that completes.
+ * ``void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq);``
- * The request structures will be cleaned up when everything has completed.
+ Tell netfslib that a read subrequest has terminated. The ``error``,
+ ``flags`` and ``transferred`` fields should have been updated.
+ * ``void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error);``
-Read Helper Cache API
----------------------
+ Tell netfslib that a write subrequest has terminated. Either the amount of
+ data processed or the negative error code can be passed in. This is
+ can be used as a kiocb completion function.
-When implementing a local cache to be used by the read helpers, two things are
-required: some way for the network filesystem to initialise the caching for a
-read request and a table of operations for the helpers to call.
+ * ``void netfs_read_subreq_progress(struct netfs_io_subrequest *subreq);``
-To begin a cache operation on an fscache object, the following function is
-called::
+ This is provided to optionally update netfslib on the incremental progress
+ of a read, allowing some folios to be unlocked early and does not actually
+ terminate the subrequest. The ``transferred`` field should have been
+ updated.
- int fscache_begin_read_operation(struct netfs_io_request *rreq,
- struct fscache_cookie *cookie);
+Local Cache API
+---------------
-passing in the request pointer and the cookie corresponding to the file. This
-fills in the cache resources mentioned below.
+Netfslib provides a separate API for a local cache to implement, though it
+provides some somewhat similar routines to the filesystem request API.
-The netfs_io_request object contains a place for the cache to hang its
+Firstly, the netfs_io_request object contains a place for the cache to hang its
state::
struct netfs_cache_resources {
const struct netfs_cache_ops *ops;
void *cache_priv;
void *cache_priv2;
+ unsigned int debug_id;
+ unsigned int inval_counter;
};
-This contains an operations table pointer and two private pointers. The
-operation table looks like the following::
+This contains an operations table pointer and two private pointers plus the
+debug ID of the fscache cookie for tracing purposes and an invalidation counter
+that is cranked by calls to ``fscache_invalidate()`` allowing cache subrequests
+to be invalidated after completion.
+
+The cache operation table looks like the following::
struct netfs_cache_ops {
void (*end_operation)(struct netfs_cache_resources *cres);
-
void (*expand_readahead)(struct netfs_cache_resources *cres,
loff_t *_start, size_t *_len, loff_t i_size);
-
enum netfs_io_source (*prepare_read)(struct netfs_io_subrequest *subreq,
- loff_t i_size);
-
+ loff_t i_size);
int (*read)(struct netfs_cache_resources *cres,
loff_t start_pos,
struct iov_iter *iter,
bool seek_data,
netfs_io_terminated_t term_func,
void *term_func_priv);
-
- int (*prepare_write)(struct netfs_cache_resources *cres,
- loff_t *_start, size_t *_len, loff_t i_size,
- bool no_space_allocated_yet);
-
- int (*write)(struct netfs_cache_resources *cres,
- loff_t start_pos,
- struct iov_iter *iter,
- netfs_io_terminated_t term_func,
- void *term_func_priv);
-
- int (*query_occupancy)(struct netfs_cache_resources *cres,
- loff_t start, size_t len, size_t granularity,
- loff_t *_data_start, size_t *_data_len);
+ void (*prepare_write_subreq)(struct netfs_io_subrequest *subreq);
+ void (*issue_write)(struct netfs_io_subrequest *subreq);
};
With a termination handler function pointer::
@@ -511,10 +984,16 @@ The methods defined in the table are:
* ``expand_readahead()``
- [Optional] Called at the beginning of a netfs_readahead() operation to allow
- the cache to expand a request in either direction. This allows the cache to
+ [Optional] Called at the beginning of a readahead operation to allow the
+ cache to expand a request in either direction. This allows the cache to
size the request appropriately for the cache granularity.
+ * ``prepare_read()``
+
+ [Required] Called to configure the next slice of a request. ->start and
+ ->len in the subrequest indicate where and how big the next slice can be;
+ the cache gets to reduce the length to match its granularity requirements.
+
The function is passed pointers to the start and length in its parameters,
plus the size of the file for reference, and adjusts the start and length
appropriately. It should return one of:
@@ -528,12 +1007,6 @@ The methods defined in the table are:
downloaded from the server or read from the cache - or whether slicing
should be given up at the current point.
- * ``prepare_read()``
-
- [Required] Called to configure the next slice of a request. ->start and
- ->len in the subrequest indicate where and how big the next slice can be;
- the cache gets to reduce the length to match its granularity requirements.
-
* ``read()``
[Required] Called to read from the cache. The start file offset is given
@@ -547,44 +1020,33 @@ The methods defined in the table are:
indicating whether the termination is definitely happening in the caller's
context.
- * ``prepare_write()``
+ * ``prepare_write_subreq()``
- [Required] Called to prepare a write to the cache to take place. This
- involves checking to see whether the cache has sufficient space to honour
- the write. ``*_start`` and ``*_len`` indicate the region to be written; the
- region can be shrunk or it can be expanded to a page boundary either way as
- necessary to align for direct I/O. i_size holds the size of the object and
- is provided for reference. no_space_allocated_yet is set to true if the
- caller is certain that no data has been written to that region - for example
- if it tried to do a read from there already.
+ [Required] This is called to allow the cache to limit the size of a
+ subrequest. It may also limit the number of individual regions in iterator,
+ such as required by DIO/DMA. This information should be set on stream to
+ which the subrequest belongs::
- * ``write()``
+ rreq->io_streams[subreq->stream_nr].sreq_max_len
+ rreq->io_streams[subreq->stream_nr].sreq_max_segs
- [Required] Called to write to the cache. The start file offset is given
- along with an iterator to write from, which gives the length also.
-
- Also provided is a pointer to a termination handler function and private
- data to pass to that function. The termination function should be called
- with the number of bytes transferred or an error code, plus a flag
- indicating whether the termination is definitely happening in the caller's
- context.
+ The filesystem can use this, for example, to chop up a request that has to
+ be split across multiple servers or to put multiple writes in flight.
- * ``query_occupancy()``
+ This is not permitted to return an error. In the event of failure,
+ ``netfs_prepare_write_failed()`` must be called.
- [Required] Called to find out where the next piece of data is within a
- particular region of the cache. The start and length of the region to be
- queried are passed in, along with the granularity to which the answer needs
- to be aligned. The function passes back the start and length of the data,
- if any, available within that region. Note that there may be a hole at the
- front.
+ * ``issue_write()``
- It returns 0 if some data was found, -ENODATA if there was no usable data
- within the region or -ENOBUFS if there is no caching on this file.
+ [Required] This is used to dispatch a subrequest to the cache for writing.
+ In the subrequest, ->start, ->len and ->transferred indicate what data
+ should be written to the cache and ->io_iter indicates the buffer to be
+ used.
-Note that these methods are passed a pointer to the cache resource structure,
-not the read request structure as they could be used in other situations where
-there isn't a read request structure as well, such as writing dirty data to the
-cache.
+ There is no return value; the ``netfs_write_subreq_terminated()`` function
+ should be called to indicate that the subrequest completed either way.
+ ->error, ->transferred and ->flags should be updated before completing. The
+ termination can be done asynchronously.
API Function Reference
diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst
index 767b2927c762..3111ef5592f3 100644
--- a/Documentation/filesystems/porting.rst
+++ b/Documentation/filesystems/porting.rst
@@ -1203,3 +1203,43 @@ should use d_drop();d_splice_alias() and return the result of the latter.
If a positive dentry cannot be returned for some reason, in-kernel
clients such as cachefiles, nfsd, smb/server may not perform ideally but
will fail-safe.
+
+---
+
+** mandatory**
+
+lookup_one(), lookup_one_unlocked(), lookup_one_positive_unlocked() now
+take a qstr instead of a name and len. These, not the "one_len"
+versions, should be used whenever accessing a filesystem from outside
+that filesysmtem, through a mount point - which will have a mnt_idmap.
+
+---
+
+** mandatory**
+
+Functions try_lookup_one_len(), lookup_one_len(),
+lookup_one_len_unlocked() and lookup_positive_unlocked() have been
+renamed to try_lookup_noperm(), lookup_noperm(),
+lookup_noperm_unlocked(), lookup_noperm_positive_unlocked(). They now
+take a qstr instead of separate name and length. QSTR() can be used
+when strlen() is needed for the length.
+
+For try_lookup_noperm() a reference to the qstr is passed in case the
+hash might subsequently be needed.
+
+These function no longer do any permission checking - they previously
+checked that the caller has 'X' permission on the parent. They must
+ONLY be used internally by a filesystem on itself when it knows that
+permissions are irrelevant or in a context where permission checks have
+already been performed such as after vfs_path_parent_lookup()
+
+---
+
+** mandatory**
+
+d_hash_and_lookup() is no longer exported or available outside the VFS.
+Use try_lookup_noperm() instead. This adds name validation and takes
+arguments in the opposite order but is otherwise identical.
+
+Using try_lookup_noperm() will require linux/namei.h to be included.
+
diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst
index ae79c30b6c0c..bf051c7da6b8 100644
--- a/Documentation/filesystems/vfs.rst
+++ b/Documentation/filesystems/vfs.rst
@@ -716,9 +716,8 @@ page lookup by address, and keeping track of pages tagged as Dirty or
Writeback.
The first can be used independently to the others. The VM can try to
-either write dirty pages in order to clean them, or release clean pages
-in order to reuse them. To do this it can call the ->writepage method
-on dirty pages, and ->release_folio on clean folios with the private
+release clean pages in order to reuse them. To do this it can call
+->release_folio on clean folios with the private
flag set. Clean pages without PagePrivate and with no external references
will be released without notice being given to the address_space.
@@ -731,8 +730,8 @@ maintains information about the PG_Dirty and PG_Writeback status of each
page, so that pages with either of these flags can be found quickly.
The Dirty tag is primarily used by mpage_writepages - the default
-->writepages method. It uses the tag to find dirty pages to call
-->writepage on. If mpage_writepages is not used (i.e. the address
+->writepages method. It uses the tag to find dirty pages to
+write back. If mpage_writepages is not used (i.e. the address
provides its own ->writepages) , the PAGECACHE_TAG_DIRTY tag is almost
unused. write_inode_now and sync_inode do use it (through
__sync_single_inode) to check if ->writepages has been successful in
@@ -756,23 +755,23 @@ pages, however the address_space has finer control of write sizes.
The read process essentially only requires 'read_folio'. The write
process is more complicated and uses write_begin/write_end or
-dirty_folio to write data into the address_space, and writepage and
+dirty_folio to write data into the address_space, and
writepages to writeback data to storage.
Adding and removing pages to/from an address_space is protected by the
inode's i_mutex.
When data is written to a page, the PG_Dirty flag should be set. It
-typically remains set until writepage asks for it to be written. This
+typically remains set until writepages asks for it to be written. This
should clear PG_Dirty and set PG_Writeback. It can be actually written
at any point after PG_Dirty is clear. Once it is known to be safe,
PG_Writeback is cleared.
Writeback makes use of a writeback_control structure to direct the
-operations. This gives the writepage and writepages operations some
+operations. This gives the writepages operation some
information about the nature of and reason for the writeback request,
and the constraints under which it is being done. It is also used to
-return information back to the caller about the result of a writepage or
+return information back to the caller about the result of a
writepages request.
@@ -819,7 +818,6 @@ cache in your filesystem. The following members are defined:
.. code-block:: c
struct address_space_operations {
- int (*writepage)(struct page *page, struct writeback_control *wbc);
int (*read_folio)(struct file *, struct folio *);
int (*writepages)(struct address_space *, struct writeback_control *);
bool (*dirty_folio)(struct address_space *, struct folio *);
@@ -848,25 +846,6 @@ cache in your filesystem. The following members are defined:
int (*swap_rw)(struct kiocb *iocb, struct iov_iter *iter);
};
-``writepage``
- called by the VM to write a dirty page to backing store. This
- may happen for data integrity reasons (i.e. 'sync'), or to free
- up memory (flush). The difference can be seen in
- wbc->sync_mode. The PG_Dirty flag has been cleared and
- PageLocked is true. writepage should start writeout, should set
- PG_Writeback, and should make sure the page is unlocked, either
- synchronously or asynchronously when the write operation
- completes.
-
- If wbc->sync_mode is WB_SYNC_NONE, ->writepage doesn't have to
- try too hard if there are problems, and may choose to write out
- other pages from the mapping if that is easier (e.g. due to
- internal dependencies). If it chooses not to start writeout, it
- should return AOP_WRITEPAGE_ACTIVATE so that the VM will not
- keep calling ->writepage on that page.
-
- See the file "Locking" for more details.
-
``read_folio``
Called by the page cache to read a folio from the backing store.
The 'file' argument supplies authentication information to network
@@ -909,7 +888,7 @@ cache in your filesystem. The following members are defined:
given and that many pages should be written if possible. If no
->writepages is given, then mpage_writepages is used instead.
This will choose pages from the address space that are tagged as
- DIRTY and will pass them to ->writepage.
+ DIRTY and will write them back.
``dirty_folio``
called by the VM to mark a folio as dirty. This is particularly
diff --git a/Documentation/netlink/specs/tc.yaml b/Documentation/netlink/specs/tc.yaml
index aacccea5dfe4..953aa837958b 100644
--- a/Documentation/netlink/specs/tc.yaml
+++ b/Documentation/netlink/specs/tc.yaml
@@ -2017,7 +2017,8 @@ attribute-sets:
attributes:
-
name: act
- type: nest
+ type: indexed-array
+ sub-type: nest
nested-attributes: tc-act-attrs
-
name: police
@@ -2250,7 +2251,8 @@ attribute-sets:
attributes:
-
name: act
- type: nest
+ type: indexed-array
+ sub-type: nest
nested-attributes: tc-act-attrs
-
name: police
@@ -2745,7 +2747,7 @@ attribute-sets:
type: u16
byte-order: big-endian
-
- name: key-l2-tpv3-sid
+ name: key-l2tpv3-sid
type: u32
byte-order: big-endian
-
@@ -3504,7 +3506,7 @@ attribute-sets:
name: rate64
type: u64
-
- name: prate4
+ name: prate64
type: u64
-
name: burst
diff --git a/Documentation/networking/timestamping.rst b/Documentation/networking/timestamping.rst
index b8fef8101176..7aabead90648 100644
--- a/Documentation/networking/timestamping.rst
+++ b/Documentation/networking/timestamping.rst
@@ -811,11 +811,9 @@ Documentation/devicetree/bindings/ptp/timestamper.txt for more details.
3.2.4 Other caveats for MAC drivers
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-Stacked PHCs, especially DSA (but not only) - since that doesn't require any
-modification to MAC drivers, so it is more difficult to ensure correctness of
-all possible code paths - is that they uncover bugs which were impossible to
-trigger before the existence of stacked PTP clocks. One example has to do with
-this line of code, already presented earlier::
+The use of stacked PHCs may uncover MAC driver bugs which were impossible to
+trigger without them. One example has to do with this line of code, already
+presented earlier::
skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS;
diff --git a/MAINTAINERS b/MAINTAINERS
index 3563492e4eba..9fbe96a8c057 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -10147,6 +10147,13 @@ F: drivers/gpio/gpio-regmap.c
F: include/linux/gpio/regmap.h
K: (devm_)?gpio_regmap_(un)?register
+GPIO SLOPPY LOGIC ANALYZER
+M: Wolfram Sang <wsa+renesas@sang-engineering.com>
+S: Supported
+F: Documentation/dev-tools/gpio-sloppy-logic-analyzer.rst
+F: drivers/gpio/gpio-sloppy-logic-analyzer.c
+F: tools/gpio/gpio-sloppy-logic-analyzer.sh
+
GPIO SUBSYSTEM
M: Linus Walleij <linus.walleij@linaro.org>
M: Bartosz Golaszewski <brgl@bgdev.pl>
@@ -11099,6 +11106,14 @@ L: linuxppc-dev@lists.ozlabs.org
S: Odd Fixes
F: drivers/tty/hvc/
+HUNG TASK DETECTOR
+M: Andrew Morton <akpm@linux-foundation.org>
+R: Lance Yang <lance.yang@linux.dev>
+L: linux-kernel@vger.kernel.org
+S: Maintained
+F: include/linux/hung_task.h
+F: kernel/hung_task.c
+
I2C ACPI SUPPORT
M: Mika Westerberg <westeri@kernel.org>
L: linux-i2c@vger.kernel.org
@@ -15542,6 +15557,53 @@ S: Maintained
F: include/linux/execmem.h
F: mm/execmem.c
+MEMORY MANAGEMENT - GUP (GET USER PAGES)
+M: Andrew Morton <akpm@linux-foundation.org>
+M: David Hildenbrand <david@redhat.com>
+R: Jason Gunthorpe <jgg@nvidia.com>
+R: John Hubbard <jhubbard@nvidia.com>
+R: Peter Xu <peterx@redhat.com>
+L: linux-mm@kvack.org
+S: Maintained
+W: http://www.linux-mm.org
+T: git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
+F: mm/gup.c
+
+MEMORY MANAGEMENT - KSM (Kernel Samepage Merging)
+M: Andrew Morton <akpm@linux-foundation.org>
+M: David Hildenbrand <david@redhat.com>
+R: Xu Xin <xu.xin16@zte.com.cn>
+R: Chengming Zhou <chengming.zhou@linux.dev>
+L: linux-mm@kvack.org
+S: Maintained
+W: http://www.linux-mm.org
+T: git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
+F: Documentation/admin-guide/mm/ksm.rst
+F: Documentation/mm/ksm.rst
+F: include/linux/ksm.h
+F: include/trace/events/ksm.h
+F: mm/ksm.c
+
+MEMORY MANAGEMENT - MEMORY POLICY AND MIGRATION
+M: Andrew Morton <akpm@linux-foundation.org>
+M: David Hildenbrand <david@redhat.com>
+R: Zi Yan <ziy@nvidia.com>
+R: Matthew Brost <matthew.brost@intel.com>
+R: Joshua Hahn <joshua.hahnjy@gmail.com>
+R: Rakie Kim <rakie.kim@sk.com>
+R: Byungchul Park <byungchul@sk.com>
+R: Gregory Price <gourry@gourry.net>
+R: Ying Huang <ying.huang@linux.alibaba.com>
+L: linux-mm@kvack.org
+S: Maintained
+W: http://www.linux-mm.org
+T: git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
+F: include/linux/mempolicy.h
+F: include/linux/migrate.h
+F: mm/mempolicy.c
+F: mm/migrate.c
+F: mm/migrate_device.c
+
MEMORY MANAGEMENT - NUMA MEMBLOCKS AND NUMA EMULATION
M: Andrew Morton <akpm@linux-foundation.org>
M: Mike Rapoport <rppt@kernel.org>
@@ -15554,7 +15616,7 @@ F: mm/numa_memblks.c
MEMORY MANAGEMENT - PAGE ALLOCATOR
M: Andrew Morton <akpm@linux-foundation.org>
-R: Vlastimil Babka <vbabka@suse.cz>
+M: Vlastimil Babka <vbabka@suse.cz>
R: Suren Baghdasaryan <surenb@google.com>
R: Michal Hocko <mhocko@suse.com>
R: Brendan Jackman <jackmanb@google.com>
@@ -15562,10 +15624,25 @@ R: Johannes Weiner <hannes@cmpxchg.org>
R: Zi Yan <ziy@nvidia.com>
L: linux-mm@kvack.org
S: Maintained
+F: include/linux/compaction.h
+F: include/linux/gfp.h
+F: include/linux/page-isolation.h
F: mm/compaction.c
F: mm/page_alloc.c
-F: include/linux/gfp.h
-F: include/linux/compaction.h
+F: mm/page_isolation.c
+
+MEMORY MANAGEMENT - RECLAIM
+M: Andrew Morton <akpm@linux-foundation.org>
+M: Johannes Weiner <hannes@cmpxchg.org>
+R: David Hildenbrand <david@redhat.com>
+R: Michal Hocko <mhocko@kernel.org>
+R: Qi Zheng <zhengqi.arch@bytedance.com>
+R: Shakeel Butt <shakeel.butt@linux.dev>
+R: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
+L: linux-mm@kvack.org
+S: Maintained
+F: mm/pt_reclaim.c
+F: mm/vmscan.c
MEMORY MANAGEMENT - RMAP (REVERSE MAPPING)
M: Andrew Morton <akpm@linux-foundation.org>
@@ -18432,7 +18509,7 @@ F: include/uapi/linux/ppdev.h
PARAVIRT_OPS INTERFACE
M: Juergen Gross <jgross@suse.com>
R: Ajay Kaher <ajay.kaher@broadcom.com>
-R: Alexey Makhalov <alexey.amakhalov@broadcom.com>
+R: Alexey Makhalov <alexey.makhalov@broadcom.com>
R: Broadcom internal kernel review list <bcm-kernel-feedback-list@broadcom.com>
L: virtualization@lists.linux.dev
L: x86@kernel.org
@@ -20594,8 +20671,8 @@ F: Documentation/devicetree/bindings/i2c/renesas,iic-emev2.yaml
F: drivers/i2c/busses/i2c-emev2.c
RENESAS ETHERNET AVB DRIVER
-M: Paul Barker <paul.barker.ct@bp.renesas.com>
M: Niklas Söderlund <niklas.soderlund@ragnatech.se>
+R: Paul Barker <paul@pbarker.dev>
L: netdev@vger.kernel.org
L: linux-renesas-soc@vger.kernel.org
S: Maintained
@@ -25893,7 +25970,7 @@ F: tools/testing/vsock/
VMALLOC
M: Andrew Morton <akpm@linux-foundation.org>
-R: Uladzislau Rezki <urezki@gmail.com>
+M: Uladzislau Rezki <urezki@gmail.com>
L: linux-mm@kvack.org
S: Maintained
W: http://www.linux-mm.org
@@ -25917,7 +25994,7 @@ F: drivers/misc/vmw_balloon.c
VMWARE HYPERVISOR INTERFACE
M: Ajay Kaher <ajay.kaher@broadcom.com>
-M: Alexey Makhalov <alexey.amakhalov@broadcom.com>
+M: Alexey Makhalov <alexey.makhalov@broadcom.com>
R: Broadcom internal kernel review list <bcm-kernel-feedback-list@broadcom.com>
L: virtualization@lists.linux.dev
L: x86@kernel.org
@@ -25945,7 +26022,7 @@ F: drivers/scsi/vmw_pvscsi.h
VMWARE VIRTUAL PTP CLOCK DRIVER
M: Nick Shi <nick.shi@broadcom.com>
R: Ajay Kaher <ajay.kaher@broadcom.com>
-R: Alexey Makhalov <alexey.amakhalov@broadcom.com>
+R: Alexey Makhalov <alexey.makhalov@broadcom.com>
R: Broadcom internal kernel review list <bcm-kernel-feedback-list@broadcom.com>
L: netdev@vger.kernel.org
S: Supported
@@ -26817,6 +26894,14 @@ L: linux-kernel@vger.kernel.org
S: Maintained
F: arch/x86/kernel/cpu/zhaoxin.c
+ZONED LOOP DEVICE
+M: Damien Le Moal <dlemoal@kernel.org>
+R: Christoph Hellwig <hch@lst.de>
+L: linux-block@vger.kernel.org
+S: Maintained
+F: Documentation/admin-guide/blockdev/zoned_loop.rst
+F: drivers/block/zloop.c
+
ZONEFS FILESYSTEM
M: Damien Le Moal <dlemoal@kernel.org>
M: Naohiro Aota <naohiro.aota@wdc.com>
diff --git a/Makefile b/Makefile
index 0eb9e6c49b32..c1cd1b5fc269 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,7 @@
VERSION = 6
PATCHLEVEL = 15
SUBLEVEL = 0
-EXTRAVERSION = -rc6
+EXTRAVERSION =
NAME = Baby Opossum Posse
# *DOCUMENTATION*
diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h6-beelink-gs1.dts b/arch/arm64/boot/dts/allwinner/sun50i-h6-beelink-gs1.dts
index 13a0e63afeaf..2c64d834a2c4 100644
--- a/arch/arm64/boot/dts/allwinner/sun50i-h6-beelink-gs1.dts
+++ b/arch/arm64/boot/dts/allwinner/sun50i-h6-beelink-gs1.dts
@@ -152,28 +152,12 @@
vcc-pg-supply = <&reg_aldo1>;
};
-&r_ir {
- linux,rc-map-name = "rc-beelink-gs1";
- status = "okay";
-};
-
-&r_pio {
- /*
- * FIXME: We can't add that supply for now since it would
- * create a circular dependency between pinctrl, the regulator
- * and the RSB Bus.
- *
- * vcc-pl-supply = <&reg_aldo1>;
- */
- vcc-pm-supply = <&reg_aldo1>;
-};
-
-&r_rsb {
+&r_i2c {
status = "okay";
- axp805: pmic@745 {
+ axp805: pmic@36 {
compatible = "x-powers,axp805", "x-powers,axp806";
- reg = <0x745>;
+ reg = <0x36>;
interrupt-parent = <&r_intc>;
interrupts = <GIC_SPI 96 IRQ_TYPE_LEVEL_LOW>;
interrupt-controller;
@@ -291,6 +275,22 @@
};
};
+&r_ir {
+ linux,rc-map-name = "rc-beelink-gs1";
+ status = "okay";
+};
+
+&r_pio {
+ /*
+ * PL0 and PL1 are used for PMIC I2C
+ * don't enable the pl-supply else
+ * it will fail at boot
+ *
+ * vcc-pl-supply = <&reg_aldo1>;
+ */
+ vcc-pm-supply = <&reg_aldo1>;
+};
+
&spdif {
pinctrl-names = "default";
pinctrl-0 = <&spdif_tx_pin>;
diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h6-orangepi-3.dts b/arch/arm64/boot/dts/allwinner/sun50i-h6-orangepi-3.dts
index ab87c3447cd7..f005072c68a1 100644
--- a/arch/arm64/boot/dts/allwinner/sun50i-h6-orangepi-3.dts
+++ b/arch/arm64/boot/dts/allwinner/sun50i-h6-orangepi-3.dts
@@ -176,16 +176,12 @@
vcc-pg-supply = <&reg_vcc_wifi_io>;
};
-&r_ir {
- status = "okay";
-};
-
-&r_rsb {
+&r_i2c {
status = "okay";
- axp805: pmic@745 {
+ axp805: pmic@36 {
compatible = "x-powers,axp805", "x-powers,axp806";
- reg = <0x745>;
+ reg = <0x36>;
interrupt-parent = <&r_intc>;
interrupts = <GIC_SPI 96 IRQ_TYPE_LEVEL_LOW>;
interrupt-controller;
@@ -296,6 +292,10 @@
};
};
+&r_ir {
+ status = "okay";
+};
+
&rtc {
clocks = <&ext_osc32k>;
};
diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h6-orangepi.dtsi b/arch/arm64/boot/dts/allwinner/sun50i-h6-orangepi.dtsi
index d05dc5d6e6b9..e34dbb992021 100644
--- a/arch/arm64/boot/dts/allwinner/sun50i-h6-orangepi.dtsi
+++ b/arch/arm64/boot/dts/allwinner/sun50i-h6-orangepi.dtsi
@@ -113,20 +113,12 @@
vcc-pg-supply = <&reg_aldo1>;
};
-&r_ir {
- status = "okay";
-};
-
-&r_pio {
- vcc-pm-supply = <&reg_bldo3>;
-};
-
-&r_rsb {
+&r_i2c {
status = "okay";
- axp805: pmic@745 {
+ axp805: pmic@36 {
compatible = "x-powers,axp805", "x-powers,axp806";
- reg = <0x745>;
+ reg = <0x36>;
interrupt-parent = <&r_intc>;
interrupts = <GIC_SPI 96 IRQ_TYPE_LEVEL_LOW>;
interrupt-controller;
@@ -241,6 +233,14 @@
};
};
+&r_ir {
+ status = "okay";
+};
+
+&r_pio {
+ vcc-pm-supply = <&reg_bldo3>;
+};
+
&rtc {
clocks = <&ext_osc32k>;
};
diff --git a/arch/arm64/boot/dts/marvell/armada-3720-uDPU.dtsi b/arch/arm64/boot/dts/marvell/armada-3720-uDPU.dtsi
index 3a9b6907185d..242820845707 100644
--- a/arch/arm64/boot/dts/marvell/armada-3720-uDPU.dtsi
+++ b/arch/arm64/boot/dts/marvell/armada-3720-uDPU.dtsi
@@ -26,6 +26,8 @@
leds {
compatible = "gpio-leds";
+ pinctrl-names = "default";
+ pinctrl-0 = <&spi_quad_pins>;
led-power1 {
label = "udpu:green:power";
@@ -82,8 +84,6 @@
&spi0 {
status = "okay";
- pinctrl-names = "default";
- pinctrl-0 = <&spi_quad_pins>;
flash@0 {
compatible = "jedec,spi-nor";
@@ -108,6 +108,10 @@
};
};
+&spi_quad_pins {
+ function = "gpio";
+};
+
&pinctrl_nb {
i2c2_recovery_pins: i2c2-recovery-pins {
groups = "i2c2";
diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig
index 5bb8f09422a2..c4ce2c67c0e0 100644
--- a/arch/arm64/configs/defconfig
+++ b/arch/arm64/configs/defconfig
@@ -1729,12 +1729,12 @@ CONFIG_NLS_CODEPAGE_437=y
CONFIG_NLS_ISO8859_1=y
CONFIG_SECURITY=y
CONFIG_CRYPTO_USER=y
+CONFIG_CRYPTO_CHACHA20=m
CONFIG_CRYPTO_TEST=m
CONFIG_CRYPTO_ECHAINIV=y
CONFIG_CRYPTO_MICHAEL_MIC=m
CONFIG_CRYPTO_ANSI_CPRNG=y
CONFIG_CRYPTO_USER_API_RNG=m
-CONFIG_CRYPTO_CHACHA20_NEON=m
CONFIG_CRYPTO_GHASH_ARM64_CE=y
CONFIG_CRYPTO_SHA1_ARM64_CE=y
CONFIG_CRYPTO_SHA2_ARM64_CE=y
diff --git a/arch/loongarch/include/asm/ptrace.h b/arch/loongarch/include/asm/ptrace.h
index a5b63c84f854..e5d21e836d99 100644
--- a/arch/loongarch/include/asm/ptrace.h
+++ b/arch/loongarch/include/asm/ptrace.h
@@ -55,7 +55,7 @@ static inline void instruction_pointer_set(struct pt_regs *regs, unsigned long v
/* Query offset/name of register from its name/offset */
extern int regs_query_register_offset(const char *name);
-#define MAX_REG_OFFSET (offsetof(struct pt_regs, __last))
+#define MAX_REG_OFFSET (offsetof(struct pt_regs, __last) - sizeof(unsigned long))
/**
* regs_get_register() - get register value from its offset
diff --git a/arch/loongarch/include/asm/uprobes.h b/arch/loongarch/include/asm/uprobes.h
index 99a0d198927f..025fc3f0a102 100644
--- a/arch/loongarch/include/asm/uprobes.h
+++ b/arch/loongarch/include/asm/uprobes.h
@@ -15,7 +15,6 @@ typedef u32 uprobe_opcode_t;
#define UPROBE_XOLBP_INSN __emit_break(BRK_UPROBE_XOLBP)
struct arch_uprobe {
- unsigned long resume_era;
u32 insn[2];
u32 ixol[2];
bool simulate;
diff --git a/arch/loongarch/kernel/genex.S b/arch/loongarch/kernel/genex.S
index 4f0912141781..733a7665e434 100644
--- a/arch/loongarch/kernel/genex.S
+++ b/arch/loongarch/kernel/genex.S
@@ -16,6 +16,7 @@
#include <asm/stackframe.h>
#include <asm/thread_info.h>
+ .section .cpuidle.text, "ax"
.align 5
SYM_FUNC_START(__arch_cpu_idle)
/* start of idle interrupt region */
@@ -31,14 +32,16 @@ SYM_FUNC_START(__arch_cpu_idle)
*/
idle 0
/* end of idle interrupt region */
-1: jr ra
+idle_exit:
+ jr ra
SYM_FUNC_END(__arch_cpu_idle)
+ .previous
SYM_CODE_START(handle_vint)
UNWIND_HINT_UNDEFINED
BACKUP_T0T1
SAVE_ALL
- la_abs t1, 1b
+ la_abs t1, idle_exit
LONG_L t0, sp, PT_ERA
/* 3 instructions idle interrupt region */
ori t0, t0, 0b1100
diff --git a/arch/loongarch/kernel/kfpu.c b/arch/loongarch/kernel/kfpu.c
index ec5b28e570c9..4c476904227f 100644
--- a/arch/loongarch/kernel/kfpu.c
+++ b/arch/loongarch/kernel/kfpu.c
@@ -18,11 +18,28 @@ static unsigned int euen_mask = CSR_EUEN_FPEN;
static DEFINE_PER_CPU(bool, in_kernel_fpu);
static DEFINE_PER_CPU(unsigned int, euen_current);
+static inline void fpregs_lock(void)
+{
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ preempt_disable();
+ else
+ local_bh_disable();
+}
+
+static inline void fpregs_unlock(void)
+{
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ preempt_enable();
+ else
+ local_bh_enable();
+}
+
void kernel_fpu_begin(void)
{
unsigned int *euen_curr;
- preempt_disable();
+ if (!irqs_disabled())
+ fpregs_lock();
WARN_ON(this_cpu_read(in_kernel_fpu));
@@ -73,7 +90,8 @@ void kernel_fpu_end(void)
this_cpu_write(in_kernel_fpu, false);
- preempt_enable();
+ if (!irqs_disabled())
+ fpregs_unlock();
}
EXPORT_SYMBOL_GPL(kernel_fpu_end);
diff --git a/arch/loongarch/kernel/time.c b/arch/loongarch/kernel/time.c
index e2d3bfeb6366..bc75a3a69fc8 100644
--- a/arch/loongarch/kernel/time.c
+++ b/arch/loongarch/kernel/time.c
@@ -111,7 +111,7 @@ static unsigned long __init get_loops_per_jiffy(void)
return lpj;
}
-static long init_offset __nosavedata;
+static long init_offset;
void save_counter(void)
{
diff --git a/arch/loongarch/kernel/uprobes.c b/arch/loongarch/kernel/uprobes.c
index 87abc7137b73..6022eb0f71db 100644
--- a/arch/loongarch/kernel/uprobes.c
+++ b/arch/loongarch/kernel/uprobes.c
@@ -42,7 +42,6 @@ int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
utask->autask.saved_trap_nr = current->thread.trap_nr;
current->thread.trap_nr = UPROBE_TRAP_NR;
instruction_pointer_set(regs, utask->xol_vaddr);
- user_enable_single_step(current);
return 0;
}
@@ -53,13 +52,7 @@ int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
WARN_ON_ONCE(current->thread.trap_nr != UPROBE_TRAP_NR);
current->thread.trap_nr = utask->autask.saved_trap_nr;
-
- if (auprobe->simulate)
- instruction_pointer_set(regs, auprobe->resume_era);
- else
- instruction_pointer_set(regs, utask->vaddr + LOONGARCH_INSN_SIZE);
-
- user_disable_single_step(current);
+ instruction_pointer_set(regs, utask->vaddr + LOONGARCH_INSN_SIZE);
return 0;
}
@@ -70,7 +63,6 @@ void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
current->thread.trap_nr = utask->autask.saved_trap_nr;
instruction_pointer_set(regs, utask->vaddr);
- user_disable_single_step(current);
}
bool arch_uprobe_xol_was_trapped(struct task_struct *t)
@@ -90,7 +82,6 @@ bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)
insn.word = auprobe->insn[0];
arch_simulate_insn(insn, regs);
- auprobe->resume_era = regs->csr_era;
return true;
}
diff --git a/arch/loongarch/power/hibernate.c b/arch/loongarch/power/hibernate.c
index 1e0590542f98..e7b7346592cb 100644
--- a/arch/loongarch/power/hibernate.c
+++ b/arch/loongarch/power/hibernate.c
@@ -2,6 +2,7 @@
#include <asm/fpu.h>
#include <asm/loongson.h>
#include <asm/sections.h>
+#include <asm/time.h>
#include <asm/tlbflush.h>
#include <linux/suspend.h>
@@ -14,6 +15,7 @@ struct pt_regs saved_regs;
void save_processor_state(void)
{
+ save_counter();
saved_crmd = csr_read32(LOONGARCH_CSR_CRMD);
saved_prmd = csr_read32(LOONGARCH_CSR_PRMD);
saved_euen = csr_read32(LOONGARCH_CSR_EUEN);
@@ -26,6 +28,7 @@ void save_processor_state(void)
void restore_processor_state(void)
{
+ sync_counter();
csr_write32(saved_crmd, LOONGARCH_CSR_CRMD);
csr_write32(saved_prmd, LOONGARCH_CSR_PRMD);
csr_write32(saved_euen, LOONGARCH_CSR_EUEN);
diff --git a/arch/m68k/configs/amcore_defconfig b/arch/m68k/configs/amcore_defconfig
index 110279a64aa4..60767811e34a 100644
--- a/arch/m68k/configs/amcore_defconfig
+++ b/arch/m68k/configs/amcore_defconfig
@@ -2,7 +2,6 @@ CONFIG_LOCALVERSION="amcore-002"
CONFIG_DEFAULT_HOSTNAME="amcore"
CONFIG_SYSVIPC=y
# CONFIG_FHANDLE is not set
-# CONFIG_USELIB is not set
CONFIG_LOG_BUF_SHIFT=14
CONFIG_CC_OPTIMIZE_FOR_SIZE=y
# CONFIG_AIO is not set
diff --git a/arch/mips/configs/gcw0_defconfig b/arch/mips/configs/gcw0_defconfig
index bc1ef66e3999..8b7ad877e07a 100644
--- a/arch/mips/configs/gcw0_defconfig
+++ b/arch/mips/configs/gcw0_defconfig
@@ -13,7 +13,6 @@ CONFIG_MIPS_CMDLINE_DTB_EXTEND=y
CONFIG_MODULES=y
CONFIG_MODULE_UNLOAD=y
# CONFIG_BLK_DEV_BSG is not set
-# CONFIG_BOUNCE is not set
CONFIG_NET=y
CONFIG_PACKET=y
CONFIG_UNIX=y
diff --git a/arch/mips/include/asm/socket.h b/arch/mips/include/asm/socket.h
index 4724a563c5bf..43a09f0dd3ff 100644
--- a/arch/mips/include/asm/socket.h
+++ b/arch/mips/include/asm/socket.h
@@ -36,15 +36,6 @@ enum sock_type {
SOCK_PACKET = 10,
};
-#define SOCK_MAX (SOCK_PACKET + 1)
-/* Mask which covers at least up to SOCK_MASK-1. The
- * * remaining bits are used as flags. */
-#define SOCK_TYPE_MASK 0xf
-
-/* Flags for socket, socketpair, paccept */
-#define SOCK_CLOEXEC O_CLOEXEC
-#define SOCK_NONBLOCK O_NONBLOCK
-
#define ARCH_HAS_SOCKET_TYPES 1
#endif /* _ASM_SOCKET_H */
diff --git a/arch/s390/hypfs/inode.c b/arch/s390/hypfs/inode.c
index 04ea1c03a5ff..96409573c75d 100644
--- a/arch/s390/hypfs/inode.c
+++ b/arch/s390/hypfs/inode.c
@@ -342,7 +342,7 @@ static struct dentry *hypfs_create_file(struct dentry *parent, const char *name,
struct inode *inode;
inode_lock(d_inode(parent));
- dentry = lookup_one_len(name, parent, strlen(name));
+ dentry = lookup_noperm(&QSTR(name), parent);
if (IS_ERR(dentry)) {
dentry = ERR_PTR(-ENOMEM);
goto fail;
diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c
index b0c1a7a57497..36beaac713c1 100644
--- a/arch/x86/coco/sev/core.c
+++ b/arch/x86/coco/sev/core.c
@@ -959,6 +959,102 @@ void snp_accept_memory(phys_addr_t start, phys_addr_t end)
set_pages_state(vaddr, npages, SNP_PAGE_STATE_PRIVATE);
}
+static int vmgexit_ap_control(u64 event, struct sev_es_save_area *vmsa, u32 apic_id)
+{
+ bool create = event != SVM_VMGEXIT_AP_DESTROY;
+ struct ghcb_state state;
+ unsigned long flags;
+ struct ghcb *ghcb;
+ int ret = 0;
+
+ local_irq_save(flags);
+
+ ghcb = __sev_get_ghcb(&state);
+
+ vc_ghcb_invalidate(ghcb);
+
+ if (create)
+ ghcb_set_rax(ghcb, vmsa->sev_features);
+
+ ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_CREATION);
+ ghcb_set_sw_exit_info_1(ghcb,
+ ((u64)apic_id << 32) |
+ ((u64)snp_vmpl << 16) |
+ event);
+ ghcb_set_sw_exit_info_2(ghcb, __pa(vmsa));
+
+ sev_es_wr_ghcb_msr(__pa(ghcb));
+ VMGEXIT();
+
+ if (!ghcb_sw_exit_info_1_is_valid(ghcb) ||
+ lower_32_bits(ghcb->save.sw_exit_info_1)) {
+ pr_err("SNP AP %s error\n", (create ? "CREATE" : "DESTROY"));
+ ret = -EINVAL;
+ }
+
+ __sev_put_ghcb(&state);
+
+ local_irq_restore(flags);
+
+ return ret;
+}
+
+static int snp_set_vmsa(void *va, void *caa, int apic_id, bool make_vmsa)
+{
+ int ret;
+
+ if (snp_vmpl) {
+ struct svsm_call call = {};
+ unsigned long flags;
+
+ local_irq_save(flags);
+
+ call.caa = this_cpu_read(svsm_caa);
+ call.rcx = __pa(va);
+
+ if (make_vmsa) {
+ /* Protocol 0, Call ID 2 */
+ call.rax = SVSM_CORE_CALL(SVSM_CORE_CREATE_VCPU);
+ call.rdx = __pa(caa);
+ call.r8 = apic_id;
+ } else {
+ /* Protocol 0, Call ID 3 */
+ call.rax = SVSM_CORE_CALL(SVSM_CORE_DELETE_VCPU);
+ }
+
+ ret = svsm_perform_call_protocol(&call);
+
+ local_irq_restore(flags);
+ } else {
+ /*
+ * If the kernel runs at VMPL0, it can change the VMSA
+ * bit for a page using the RMPADJUST instruction.
+ * However, for the instruction to succeed it must
+ * target the permissions of a lesser privileged (higher
+ * numbered) VMPL level, so use VMPL1.
+ */
+ u64 attrs = 1;
+
+ if (make_vmsa)
+ attrs |= RMPADJUST_VMSA_PAGE_BIT;
+
+ ret = rmpadjust((unsigned long)va, RMP_PG_SIZE_4K, attrs);
+ }
+
+ return ret;
+}
+
+static void snp_cleanup_vmsa(struct sev_es_save_area *vmsa, int apic_id)
+{
+ int err;
+
+ err = snp_set_vmsa(vmsa, NULL, apic_id, false);
+ if (err)
+ pr_err("clear VMSA page failed (%u), leaking page\n", err);
+ else
+ free_page((unsigned long)vmsa);
+}
+
static void set_pte_enc(pte_t *kpte, int level, void *va)
{
struct pte_enc_desc d = {
@@ -1005,7 +1101,8 @@ static void unshare_all_memory(void)
data = per_cpu(runtime_data, cpu);
ghcb = (unsigned long)&data->ghcb_page;
- if (addr <= ghcb && ghcb <= addr + size) {
+ /* Handle the case of a huge page containing the GHCB page */
+ if (addr <= ghcb && ghcb < addr + size) {
skipped_addr = true;
break;
}
@@ -1055,11 +1152,70 @@ void snp_kexec_begin(void)
pr_warn("Failed to stop shared<->private conversions\n");
}
+/*
+ * Shutdown all APs except the one handling kexec/kdump and clearing
+ * the VMSA tag on AP's VMSA pages as they are not being used as
+ * VMSA page anymore.
+ */
+static void shutdown_all_aps(void)
+{
+ struct sev_es_save_area *vmsa;
+ int apic_id, this_cpu, cpu;
+
+ this_cpu = get_cpu();
+
+ /*
+ * APs are already in HLT loop when enc_kexec_finish() callback
+ * is invoked.
+ */
+ for_each_present_cpu(cpu) {
+ vmsa = per_cpu(sev_vmsa, cpu);
+
+ /*
+ * The BSP or offlined APs do not have guest allocated VMSA
+ * and there is no need to clear the VMSA tag for this page.
+ */
+ if (!vmsa)
+ continue;
+
+ /*
+ * Cannot clear the VMSA tag for the currently running vCPU.
+ */
+ if (this_cpu == cpu) {
+ unsigned long pa;
+ struct page *p;
+
+ pa = __pa(vmsa);
+ /*
+ * Mark the VMSA page of the running vCPU as offline
+ * so that is excluded and not touched by makedumpfile
+ * while generating vmcore during kdump.
+ */
+ p = pfn_to_online_page(pa >> PAGE_SHIFT);
+ if (p)
+ __SetPageOffline(p);
+ continue;
+ }
+
+ apic_id = cpuid_to_apicid[cpu];
+
+ /*
+ * Issue AP destroy to ensure AP gets kicked out of guest mode
+ * to allow using RMPADJUST to remove the VMSA tag on it's
+ * VMSA page.
+ */
+ vmgexit_ap_control(SVM_VMGEXIT_AP_DESTROY, vmsa, apic_id);
+ snp_cleanup_vmsa(vmsa, apic_id);
+ }
+
+ put_cpu();
+}
+
void snp_kexec_finish(void)
{
struct sev_es_runtime_data *data;
+ unsigned long size, addr;
unsigned int level, cpu;
- unsigned long size;
struct ghcb *ghcb;
pte_t *pte;
@@ -1069,6 +1225,8 @@ void snp_kexec_finish(void)
if (!IS_ENABLED(CONFIG_KEXEC_CORE))
return;
+ shutdown_all_aps();
+
unshare_all_memory();
/*
@@ -1085,54 +1243,11 @@ void snp_kexec_finish(void)
ghcb = &data->ghcb_page;
pte = lookup_address((unsigned long)ghcb, &level);
size = page_level_size(level);
- set_pte_enc(pte, level, (void *)ghcb);
- snp_set_memory_private((unsigned long)ghcb, (size / PAGE_SIZE));
- }
-}
-
-static int snp_set_vmsa(void *va, void *caa, int apic_id, bool make_vmsa)
-{
- int ret;
-
- if (snp_vmpl) {
- struct svsm_call call = {};
- unsigned long flags;
-
- local_irq_save(flags);
-
- call.caa = this_cpu_read(svsm_caa);
- call.rcx = __pa(va);
-
- if (make_vmsa) {
- /* Protocol 0, Call ID 2 */
- call.rax = SVSM_CORE_CALL(SVSM_CORE_CREATE_VCPU);
- call.rdx = __pa(caa);
- call.r8 = apic_id;
- } else {
- /* Protocol 0, Call ID 3 */
- call.rax = SVSM_CORE_CALL(SVSM_CORE_DELETE_VCPU);
- }
-
- ret = svsm_perform_call_protocol(&call);
-
- local_irq_restore(flags);
- } else {
- /*
- * If the kernel runs at VMPL0, it can change the VMSA
- * bit for a page using the RMPADJUST instruction.
- * However, for the instruction to succeed it must
- * target the permissions of a lesser privileged (higher
- * numbered) VMPL level, so use VMPL1.
- */
- u64 attrs = 1;
-
- if (make_vmsa)
- attrs |= RMPADJUST_VMSA_PAGE_BIT;
-
- ret = rmpadjust((unsigned long)va, RMP_PG_SIZE_4K, attrs);
+ /* Handle the case of a huge page containing the GHCB page */
+ addr = (unsigned long)ghcb & page_level_mask(level);
+ set_pte_enc(pte, level, (void *)addr);
+ snp_set_memory_private(addr, (size / PAGE_SIZE));
}
-
- return ret;
}
#define __ATTR_BASE (SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK)
@@ -1166,24 +1281,10 @@ static void *snp_alloc_vmsa_page(int cpu)
return page_address(p + 1);
}
-static void snp_cleanup_vmsa(struct sev_es_save_area *vmsa, int apic_id)
-{
- int err;
-
- err = snp_set_vmsa(vmsa, NULL, apic_id, false);
- if (err)
- pr_err("clear VMSA page failed (%u), leaking page\n", err);
- else
- free_page((unsigned long)vmsa);
-}
-
static int wakeup_cpu_via_vmgexit(u32 apic_id, unsigned long start_ip)
{
struct sev_es_save_area *cur_vmsa, *vmsa;
- struct ghcb_state state;
struct svsm_ca *caa;
- unsigned long flags;
- struct ghcb *ghcb;
u8 sipi_vector;
int cpu, ret;
u64 cr4;
@@ -1297,33 +1398,7 @@ static int wakeup_cpu_via_vmgexit(u32 apic_id, unsigned long start_ip)
}
/* Issue VMGEXIT AP Creation NAE event */
- local_irq_save(flags);
-
- ghcb = __sev_get_ghcb(&state);
-
- vc_ghcb_invalidate(ghcb);
- ghcb_set_rax(ghcb, vmsa->sev_features);
- ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_CREATION);
- ghcb_set_sw_exit_info_1(ghcb,
- ((u64)apic_id << 32) |
- ((u64)snp_vmpl << 16) |
- SVM_VMGEXIT_AP_CREATE);
- ghcb_set_sw_exit_info_2(ghcb, __pa(vmsa));
-
- sev_es_wr_ghcb_msr(__pa(ghcb));
- VMGEXIT();
-
- if (!ghcb_sw_exit_info_1_is_valid(ghcb) ||
- lower_32_bits(ghcb->save.sw_exit_info_1)) {
- pr_err("SNP AP Creation error\n");
- ret = -EINVAL;
- }
-
- __sev_put_ghcb(&state);
-
- local_irq_restore(flags);
-
- /* Perform cleanup if there was an error */
+ ret = vmgexit_ap_control(SVM_VMGEXIT_AP_CREATE, vmsa, apic_id);
if (ret) {
snp_cleanup_vmsa(vmsa, apic_id);
vmsa = NULL;
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index 91801138b10b..7cd2f395f301 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -1,7 +1,6 @@
CONFIG_WERROR=y
CONFIG_SYSVIPC=y
CONFIG_POSIX_MQUEUE=y
-CONFIG_USELIB=y
CONFIG_AUDIT=y
CONFIG_NO_HZ=y
CONFIG_HIGH_RES_TIMERS=y
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 9b20acc0e932..8d86e91bd5e5 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -2465,8 +2465,9 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs, struct perf_sample_
setup_pebs_fixed_sample_data);
}
-static void intel_pmu_pebs_event_update_no_drain(struct cpu_hw_events *cpuc, int size)
+static void intel_pmu_pebs_event_update_no_drain(struct cpu_hw_events *cpuc, u64 mask)
{
+ u64 pebs_enabled = cpuc->pebs_enabled & mask;
struct perf_event *event;
int bit;
@@ -2477,7 +2478,7 @@ static void intel_pmu_pebs_event_update_no_drain(struct cpu_hw_events *cpuc, int
* It needs to call intel_pmu_save_and_restart_reload() to
* update the event->count for this case.
*/
- for_each_set_bit(bit, (unsigned long *)&cpuc->pebs_enabled, size) {
+ for_each_set_bit(bit, (unsigned long *)&pebs_enabled, X86_PMC_IDX_MAX) {
event = cpuc->events[bit];
if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD)
intel_pmu_save_and_restart_reload(event, 0);
@@ -2512,7 +2513,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, struct perf_sample_d
}
if (unlikely(base >= top)) {
- intel_pmu_pebs_event_update_no_drain(cpuc, size);
+ intel_pmu_pebs_event_update_no_drain(cpuc, mask);
return;
}
@@ -2626,7 +2627,7 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_d
(hybrid(cpuc->pmu, fixed_cntr_mask64) << INTEL_PMC_IDX_FIXED);
if (unlikely(base >= top)) {
- intel_pmu_pebs_event_update_no_drain(cpuc, X86_PMC_IDX_MAX);
+ intel_pmu_pebs_event_update_no_drain(cpuc, mask);
return;
}
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 39e61212ac9a..30144ef9ef02 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -75,7 +75,7 @@
#define X86_FEATURE_CENTAUR_MCR ( 3*32+ 3) /* "centaur_mcr" Centaur MCRs (= MTRRs) */
#define X86_FEATURE_K8 ( 3*32+ 4) /* Opteron, Athlon64 */
#define X86_FEATURE_ZEN5 ( 3*32+ 5) /* CPU based on Zen5 microarchitecture */
-/* Free ( 3*32+ 6) */
+#define X86_FEATURE_ZEN6 ( 3*32+ 6) /* CPU based on Zen6 microarchitecture */
/* Free ( 3*32+ 7) */
#define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* "constant_tsc" TSC ticks at a constant rate */
#define X86_FEATURE_UP ( 3*32+ 9) /* "up" SMP kernel running on UP */
diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h
index acb85b9346d8..0020d77a0800 100644
--- a/arch/x86/include/asm/sev-common.h
+++ b/arch/x86/include/asm/sev-common.h
@@ -116,7 +116,7 @@ enum psc_op {
#define GHCB_MSR_VMPL_REQ 0x016
#define GHCB_MSR_VMPL_REQ_LEVEL(v) \
/* GHCBData[39:32] */ \
- (((u64)(v) & GENMASK_ULL(7, 0) << 32) | \
+ ((((u64)(v) & GENMASK_ULL(7, 0)) << 32) | \
/* GHCBDdata[11:0] */ \
GHCB_MSR_VMPL_REQ)
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 2b36379ff675..4e06baab40bb 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -472,6 +472,11 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
case 0x60 ... 0x7f:
setup_force_cpu_cap(X86_FEATURE_ZEN5);
break;
+ case 0x50 ... 0x5f:
+ case 0x90 ... 0xaf:
+ case 0xc0 ... 0xcf:
+ setup_force_cpu_cap(X86_FEATURE_ZEN6);
+ break;
default:
goto warn;
}
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 7fa3965dcef1..bb8d99e717b9 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -566,7 +566,7 @@ static void __init lowmem_pfn_init(void)
"only %luMB highmem pages available, ignoring highmem size of %luMB!\n"
#define MSG_HIGHMEM_TRIMMED \
- "Warning: only 4GB will be used. Support for for CONFIG_HIGHMEM64G was removed!\n"
+ "Warning: only 4GB will be used. Support for CONFIG_HIGHMEM64G was removed!\n"
/*
* We have more RAM than fits into lowmem - we try to put it into
* highmem, also taking the highmem=x boot parameter into account:
diff --git a/arch/xtensa/configs/cadence_csp_defconfig b/arch/xtensa/configs/cadence_csp_defconfig
index 91c4c4cae8a7..49f50d1bd724 100644
--- a/arch/xtensa/configs/cadence_csp_defconfig
+++ b/arch/xtensa/configs/cadence_csp_defconfig
@@ -1,6 +1,5 @@
CONFIG_SYSVIPC=y
CONFIG_POSIX_MQUEUE=y
-CONFIG_USELIB=y
CONFIG_NO_HZ_IDLE=y
CONFIG_HIGH_RES_TIMERS=y
CONFIG_IRQ_TIME_ACCOUNTING=y
diff --git a/block/Kconfig b/block/Kconfig
index df8973bc0539..15027963472d 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -211,14 +211,6 @@ config BLK_INLINE_ENCRYPTION_FALLBACK
source "block/partitions/Kconfig"
-config BLK_MQ_PCI
- def_bool PCI
-
-config BLK_MQ_VIRTIO
- bool
- depends on VIRTIO
- default y
-
config BLK_PM
def_bool PM
diff --git a/block/Makefile b/block/Makefile
index 3a941dc0d27f..c65f4da93702 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -5,13 +5,12 @@
obj-y := bdev.o fops.o bio.o elevator.o blk-core.o blk-sysfs.o \
blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
- blk-merge.o blk-timeout.o \
- blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \
+ blk-merge.o blk-timeout.o blk-lib.o blk-mq.o \
+ blk-mq-tag.o blk-mq-dma.o blk-stat.o \
blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \
genhd.o ioprio.o badblocks.o partitions/ blk-rq-qos.o \
disk-events.o blk-ia-ranges.o early-lookup.o
-obj-$(CONFIG_BOUNCE) += bounce.o
obj-$(CONFIG_BLK_DEV_BSG_COMMON) += bsg.o
obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o
obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index abd80dc13562..0cb1e9873aab 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -7210,8 +7210,8 @@ static void bfq_exit_queue(struct elevator_queue *e)
#endif
blk_stat_disable_accounting(bfqd->queue);
- clear_bit(ELEVATOR_FLAG_DISABLE_WBT, &e->flags);
- wbt_enable_default(bfqd->queue->disk);
+ blk_queue_flag_clear(QUEUE_FLAG_DISABLE_WBT_DEF, bfqd->queue);
+ set_bit(ELEVATOR_FLAG_ENABLE_WBT_ON_EXIT, &e->flags);
kfree(bfqd);
}
@@ -7397,7 +7397,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
/* We dispatch from request queue wide instead of hw queue */
blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q);
- set_bit(ELEVATOR_FLAG_DISABLE_WBT, &eq->flags);
+ blk_queue_flag_set(QUEUE_FLAG_DISABLE_WBT_DEF, q);
wbt_disable_default(q->disk);
blk_stat_enable_accounting(q);
diff --git a/block/bio-integrity-auto.c b/block/bio-integrity-auto.c
index e524c609be50..9c6657664792 100644
--- a/block/bio-integrity-auto.c
+++ b/block/bio-integrity-auto.c
@@ -9,6 +9,7 @@
* not aware of PI.
*/
#include <linux/blk-integrity.h>
+#include <linux/t10-pi.h>
#include <linux/workqueue.h>
#include "blk.h"
@@ -43,6 +44,29 @@ static void bio_integrity_verify_fn(struct work_struct *work)
bio_endio(bio);
}
+#define BIP_CHECK_FLAGS (BIP_CHECK_GUARD | BIP_CHECK_REFTAG | BIP_CHECK_APPTAG)
+static bool bip_should_check(struct bio_integrity_payload *bip)
+{
+ return bip->bip_flags & BIP_CHECK_FLAGS;
+}
+
+static bool bi_offload_capable(struct blk_integrity *bi)
+{
+ switch (bi->csum_type) {
+ case BLK_INTEGRITY_CSUM_CRC64:
+ return bi->tuple_size == sizeof(struct crc64_pi_tuple);
+ case BLK_INTEGRITY_CSUM_CRC:
+ case BLK_INTEGRITY_CSUM_IP:
+ return bi->tuple_size == sizeof(struct t10_pi_tuple);
+ default:
+ pr_warn_once("%s: unknown integrity checksum type:%d\n",
+ __func__, bi->csum_type);
+ fallthrough;
+ case BLK_INTEGRITY_CSUM_NONE:
+ return false;
+ }
+}
+
/**
* __bio_integrity_endio - Integrity I/O completion function
* @bio: Protected bio
@@ -54,12 +78,12 @@ static void bio_integrity_verify_fn(struct work_struct *work)
*/
bool __bio_integrity_endio(struct bio *bio)
{
- struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk);
struct bio_integrity_payload *bip = bio_integrity(bio);
struct bio_integrity_data *bid =
container_of(bip, struct bio_integrity_data, bip);
- if (bio_op(bio) == REQ_OP_READ && !bio->bi_status && bi->csum_type) {
+ if (bio_op(bio) == REQ_OP_READ && !bio->bi_status &&
+ bip_should_check(bip)) {
INIT_WORK(&bid->work, bio_integrity_verify_fn);
queue_work(kintegrityd_wq, &bid->work);
return false;
@@ -84,6 +108,7 @@ bool bio_integrity_prep(struct bio *bio)
{
struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk);
struct bio_integrity_data *bid;
+ bool set_flags = true;
gfp_t gfp = GFP_NOIO;
unsigned int len;
void *buf;
@@ -100,19 +125,24 @@ bool bio_integrity_prep(struct bio *bio)
switch (bio_op(bio)) {
case REQ_OP_READ:
- if (bi->flags & BLK_INTEGRITY_NOVERIFY)
- return true;
+ if (bi->flags & BLK_INTEGRITY_NOVERIFY) {
+ if (bi_offload_capable(bi))
+ return true;
+ set_flags = false;
+ }
break;
case REQ_OP_WRITE:
- if (bi->flags & BLK_INTEGRITY_NOGENERATE)
- return true;
-
/*
* Zero the memory allocated to not leak uninitialized kernel
* memory to disk for non-integrity metadata where nothing else
* initializes the memory.
*/
- if (bi->csum_type == BLK_INTEGRITY_CSUM_NONE)
+ if (bi->flags & BLK_INTEGRITY_NOGENERATE) {
+ if (bi_offload_capable(bi))
+ return true;
+ set_flags = false;
+ gfp |= __GFP_ZERO;
+ } else if (bi->csum_type == BLK_INTEGRITY_CSUM_NONE)
gfp |= __GFP_ZERO;
break;
default:
@@ -137,19 +167,21 @@ bool bio_integrity_prep(struct bio *bio)
bid->bip.bip_flags |= BIP_BLOCK_INTEGRITY;
bip_set_seed(&bid->bip, bio->bi_iter.bi_sector);
- if (bi->csum_type == BLK_INTEGRITY_CSUM_IP)
- bid->bip.bip_flags |= BIP_IP_CHECKSUM;
- if (bi->csum_type)
- bid->bip.bip_flags |= BIP_CHECK_GUARD;
- if (bi->flags & BLK_INTEGRITY_REF_TAG)
- bid->bip.bip_flags |= BIP_CHECK_REFTAG;
+ if (set_flags) {
+ if (bi->csum_type == BLK_INTEGRITY_CSUM_IP)
+ bid->bip.bip_flags |= BIP_IP_CHECKSUM;
+ if (bi->csum_type)
+ bid->bip.bip_flags |= BIP_CHECK_GUARD;
+ if (bi->flags & BLK_INTEGRITY_REF_TAG)
+ bid->bip.bip_flags |= BIP_CHECK_REFTAG;
+ }
if (bio_integrity_add_page(bio, virt_to_page(buf), len,
offset_in_page(buf)) < len)
goto err_end_io;
/* Auto-generate integrity metadata if this is a write */
- if (bio_data_dir(bio) == WRITE)
+ if (bio_data_dir(bio) == WRITE && bip_should_check(&bid->bip))
blk_integrity_generate(bio);
else
bid->saved_bio_iter = bio->bi_iter;
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 43ef6bd06c85..cb94e9be26dc 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -127,10 +127,8 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
if (bip->bip_vcnt > 0) {
struct bio_vec *bv = &bip->bip_vec[bip->bip_vcnt - 1];
- bool same_page = false;
- if (bvec_try_merge_hw_page(q, bv, page, len, offset,
- &same_page)) {
+ if (bvec_try_merge_hw_page(q, bv, page, len, offset)) {
bip->bip_iter.bi_size += len;
return len;
}
diff --git a/block/bio.c b/block/bio.c
index 4e6c85a33d74..3c0a558c90f5 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -251,6 +251,7 @@ void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table,
bio->bi_flags = 0;
bio->bi_ioprio = 0;
bio->bi_write_hint = 0;
+ bio->bi_write_stream = 0;
bio->bi_status = 0;
bio->bi_iter.bi_sector = 0;
bio->bi_iter.bi_size = 0;
@@ -611,7 +612,7 @@ struct bio *bio_kmalloc(unsigned short nr_vecs, gfp_t gfp_mask)
{
struct bio *bio;
- if (nr_vecs > UIO_MAXIOV)
+ if (nr_vecs > BIO_MAX_INLINE_VECS)
return NULL;
return kmalloc(struct_size(bio, bi_inline_vecs, nr_vecs), gfp_mask);
}
@@ -827,6 +828,7 @@ static int __bio_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp)
bio_set_flag(bio, BIO_CLONED);
bio->bi_ioprio = bio_src->bi_ioprio;
bio->bi_write_hint = bio_src->bi_write_hint;
+ bio->bi_write_stream = bio_src->bi_write_stream;
bio->bi_iter = bio_src->bi_iter;
if (bio->bi_bdev) {
@@ -918,7 +920,7 @@ static inline bool bio_full(struct bio *bio, unsigned len)
}
static bool bvec_try_merge_page(struct bio_vec *bv, struct page *page,
- unsigned int len, unsigned int off, bool *same_page)
+ unsigned int len, unsigned int off)
{
size_t bv_end = bv->bv_offset + bv->bv_len;
phys_addr_t vec_end_addr = page_to_phys(bv->bv_page) + bv_end - 1;
@@ -931,9 +933,7 @@ static bool bvec_try_merge_page(struct bio_vec *bv, struct page *page,
if (!zone_device_pages_have_same_pgmap(bv->bv_page, page))
return false;
- *same_page = ((vec_end_addr & PAGE_MASK) == ((page_addr + off) &
- PAGE_MASK));
- if (!*same_page) {
+ if ((vec_end_addr & PAGE_MASK) != ((page_addr + off) & PAGE_MASK)) {
if (IS_ENABLED(CONFIG_KMSAN))
return false;
if (bv->bv_page + bv_end / PAGE_SIZE != page + off / PAGE_SIZE)
@@ -953,8 +953,7 @@ static bool bvec_try_merge_page(struct bio_vec *bv, struct page *page,
* helpers to split. Hopefully this will go away soon.
*/
bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv,
- struct page *page, unsigned len, unsigned offset,
- bool *same_page)
+ struct page *page, unsigned len, unsigned offset)
{
unsigned long mask = queue_segment_boundary(q);
phys_addr_t addr1 = bvec_phys(bv);
@@ -964,7 +963,7 @@ bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv,
return false;
if (len > queue_max_segment_size(q) - bv->bv_len)
return false;
- return bvec_try_merge_page(bv, page, len, offset, same_page);
+ return bvec_try_merge_page(bv, page, len, offset);
}
/**
@@ -990,6 +989,22 @@ void __bio_add_page(struct bio *bio, struct page *page,
EXPORT_SYMBOL_GPL(__bio_add_page);
/**
+ * bio_add_virt_nofail - add data in the direct kernel mapping to a bio
+ * @bio: destination bio
+ * @vaddr: data to add
+ * @len: length of the data to add, may cross pages
+ *
+ * Add the data at @vaddr to @bio. The caller must have ensure a segment
+ * is available for the added data. No merging into an existing segment
+ * will be performed.
+ */
+void bio_add_virt_nofail(struct bio *bio, void *vaddr, unsigned len)
+{
+ __bio_add_page(bio, virt_to_page(vaddr), len, offset_in_page(vaddr));
+}
+EXPORT_SYMBOL_GPL(bio_add_virt_nofail);
+
+/**
* bio_add_page - attempt to add page(s) to bio
* @bio: destination bio
* @page: start page to add
@@ -1002,8 +1017,6 @@ EXPORT_SYMBOL_GPL(__bio_add_page);
int bio_add_page(struct bio *bio, struct page *page,
unsigned int len, unsigned int offset)
{
- bool same_page = false;
-
if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
return 0;
if (bio->bi_iter.bi_size > UINT_MAX - len)
@@ -1011,7 +1024,7 @@ int bio_add_page(struct bio *bio, struct page *page,
if (bio->bi_vcnt > 0 &&
bvec_try_merge_page(&bio->bi_io_vec[bio->bi_vcnt - 1],
- page, len, offset, &same_page)) {
+ page, len, offset)) {
bio->bi_iter.bi_size += len;
return len;
}
@@ -1058,6 +1071,61 @@ bool bio_add_folio(struct bio *bio, struct folio *folio, size_t len,
}
EXPORT_SYMBOL(bio_add_folio);
+/**
+ * bio_add_vmalloc_chunk - add a vmalloc chunk to a bio
+ * @bio: destination bio
+ * @vaddr: vmalloc address to add
+ * @len: total length in bytes of the data to add
+ *
+ * Add data starting at @vaddr to @bio and return how many bytes were added.
+ * This may be less than the amount originally asked. Returns 0 if no data
+ * could be added to @bio.
+ *
+ * This helper calls flush_kernel_vmap_range() for the range added. For reads
+ * the caller still needs to manually call invalidate_kernel_vmap_range() in
+ * the completion handler.
+ */
+unsigned int bio_add_vmalloc_chunk(struct bio *bio, void *vaddr, unsigned len)
+{
+ unsigned int offset = offset_in_page(vaddr);
+
+ len = min(len, PAGE_SIZE - offset);
+ if (bio_add_page(bio, vmalloc_to_page(vaddr), len, offset) < len)
+ return 0;
+ if (op_is_write(bio_op(bio)))
+ flush_kernel_vmap_range(vaddr, len);
+ return len;
+}
+EXPORT_SYMBOL_GPL(bio_add_vmalloc_chunk);
+
+/**
+ * bio_add_vmalloc - add a vmalloc region to a bio
+ * @bio: destination bio
+ * @vaddr: vmalloc address to add
+ * @len: total length in bytes of the data to add
+ *
+ * Add data starting at @vaddr to @bio. Return %true on success or %false if
+ * @bio does not have enough space for the payload.
+ *
+ * This helper calls flush_kernel_vmap_range() for the range added. For reads
+ * the caller still needs to manually call invalidate_kernel_vmap_range() in
+ * the completion handler.
+ */
+bool bio_add_vmalloc(struct bio *bio, void *vaddr, unsigned int len)
+{
+ do {
+ unsigned int added = bio_add_vmalloc_chunk(bio, vaddr, len);
+
+ if (!added)
+ return false;
+ vaddr += added;
+ len -= added;
+ } while (len);
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(bio_add_vmalloc);
+
void __bio_release_pages(struct bio *bio, bool mark_dirty)
{
struct folio_iter fi;
@@ -1088,27 +1156,6 @@ void bio_iov_bvec_set(struct bio *bio, const struct iov_iter *iter)
bio_set_flag(bio, BIO_CLONED);
}
-static int bio_iov_add_folio(struct bio *bio, struct folio *folio, size_t len,
- size_t offset)
-{
- bool same_page = false;
-
- if (WARN_ON_ONCE(bio->bi_iter.bi_size > UINT_MAX - len))
- return -EIO;
-
- if (bio->bi_vcnt > 0 &&
- bvec_try_merge_page(&bio->bi_io_vec[bio->bi_vcnt - 1],
- folio_page(folio, 0), len, offset,
- &same_page)) {
- bio->bi_iter.bi_size += len;
- if (same_page && bio_flagged(bio, BIO_PAGE_PINNED))
- unpin_user_folio(folio, 1);
- return 0;
- }
- bio_add_folio_nofail(bio, folio, len, offset);
- return 0;
-}
-
static unsigned int get_contig_folio_len(unsigned int *num_pages,
struct page **pages, unsigned int i,
struct folio *folio, size_t left,
@@ -1203,6 +1250,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
for (left = size, i = 0; left > 0; left -= len, i += num_pages) {
struct page *page = pages[i];
struct folio *folio = page_folio(page);
+ unsigned int old_vcnt = bio->bi_vcnt;
folio_offset = ((size_t)folio_page_idx(folio, page) <<
PAGE_SHIFT) + offset;
@@ -1215,7 +1263,23 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
len = get_contig_folio_len(&num_pages, pages, i,
folio, left, offset);
- bio_iov_add_folio(bio, folio, len, folio_offset);
+ if (!bio_add_folio(bio, folio, len, folio_offset)) {
+ WARN_ON_ONCE(1);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (bio_flagged(bio, BIO_PAGE_PINNED)) {
+ /*
+ * We're adding another fragment of a page that already
+ * was part of the last segment. Undo our pin as the
+ * page was pinned when an earlier fragment of it was
+ * added to the bio and __bio_release_pages expects a
+ * single pin per page.
+ */
+ if (offset && bio->bi_vcnt == old_vcnt)
+ unpin_user_folio(folio, 1);
+ }
offset = 0;
}
@@ -1301,6 +1365,36 @@ int submit_bio_wait(struct bio *bio)
}
EXPORT_SYMBOL(submit_bio_wait);
+/**
+ * bdev_rw_virt - synchronously read into / write from kernel mapping
+ * @bdev: block device to access
+ * @sector: sector to access
+ * @data: data to read/write
+ * @len: length in byte to read/write
+ * @op: operation (e.g. REQ_OP_READ/REQ_OP_WRITE)
+ *
+ * Performs synchronous I/O to @bdev for @data/@len. @data must be in
+ * the kernel direct mapping and not a vmalloc address.
+ */
+int bdev_rw_virt(struct block_device *bdev, sector_t sector, void *data,
+ size_t len, enum req_op op)
+{
+ struct bio_vec bv;
+ struct bio bio;
+ int error;
+
+ if (WARN_ON_ONCE(is_vmalloc_addr(data)))
+ return -EIO;
+
+ bio_init(&bio, bdev, &bv, 1, op);
+ bio.bi_iter.bi_sector = sector;
+ bio_add_virt_nofail(&bio, data, len);
+ error = submit_bio_wait(&bio);
+ bio_uninit(&bio);
+ return error;
+}
+EXPORT_SYMBOL_GPL(bdev_rw_virt);
+
static void bio_wait_end_io(struct bio *bio)
{
complete(bio->bi_private);
diff --git a/block/blk-core.c b/block/blk-core.c
index e8cc270a453f..b862c66018f2 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1018,7 +1018,7 @@ again:
stamp = READ_ONCE(part->bd_stamp);
if (unlikely(time_after(now, stamp)) &&
likely(try_cmpxchg(&part->bd_stamp, &stamp, now)) &&
- (end || part_in_flight(part)))
+ (end || bdev_count_inflight(part)))
__part_stat_add(part, io_ticks, now - stamp);
if (bdev_is_partition(part)) {
diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c
index f154be0b575a..005c9157ffb3 100644
--- a/block/blk-crypto-fallback.c
+++ b/block/blk-crypto-fallback.c
@@ -173,6 +173,7 @@ static struct bio *blk_crypto_fallback_clone_bio(struct bio *bio_src)
bio_set_flag(bio, BIO_REMAPPED);
bio->bi_ioprio = bio_src->bi_ioprio;
bio->bi_write_hint = bio_src->bi_write_hint;
+ bio->bi_write_stream = bio_src->bi_write_stream;
bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector;
bio->bi_iter.bi_size = bio_src->bi_iter.bi_size;
diff --git a/block/blk-map.c b/block/blk-map.c
index d2f22744b3d1..23e5d5ebe59e 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -317,64 +317,26 @@ static void bio_map_kern_endio(struct bio *bio)
kfree(bio);
}
-/**
- * bio_map_kern - map kernel address into bio
- * @q: the struct request_queue for the bio
- * @data: pointer to buffer to map
- * @len: length in bytes
- * @gfp_mask: allocation flags for bio allocation
- *
- * Map the kernel address into a bio suitable for io to a block
- * device. Returns an error pointer in case of error.
- */
-static struct bio *bio_map_kern(struct request_queue *q, void *data,
- unsigned int len, gfp_t gfp_mask)
+static struct bio *bio_map_kern(void *data, unsigned int len, enum req_op op,
+ gfp_t gfp_mask)
{
- unsigned long kaddr = (unsigned long)data;
- unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
- unsigned long start = kaddr >> PAGE_SHIFT;
- const int nr_pages = end - start;
- bool is_vmalloc = is_vmalloc_addr(data);
- struct page *page;
- int offset, i;
+ unsigned int nr_vecs = bio_add_max_vecs(data, len);
struct bio *bio;
- bio = bio_kmalloc(nr_pages, gfp_mask);
+ bio = bio_kmalloc(nr_vecs, gfp_mask);
if (!bio)
return ERR_PTR(-ENOMEM);
- bio_init(bio, NULL, bio->bi_inline_vecs, nr_pages, 0);
-
- if (is_vmalloc) {
- flush_kernel_vmap_range(data, len);
+ bio_init(bio, NULL, bio->bi_inline_vecs, nr_vecs, op);
+ if (is_vmalloc_addr(data)) {
bio->bi_private = data;
- }
-
- offset = offset_in_page(kaddr);
- for (i = 0; i < nr_pages; i++) {
- unsigned int bytes = PAGE_SIZE - offset;
-
- if (len <= 0)
- break;
-
- if (bytes > len)
- bytes = len;
-
- if (!is_vmalloc)
- page = virt_to_page(data);
- else
- page = vmalloc_to_page(data);
- if (bio_add_page(bio, page, bytes, offset) < bytes) {
- /* we don't support partial mappings */
+ if (!bio_add_vmalloc(bio, data, len)) {
bio_uninit(bio);
kfree(bio);
return ERR_PTR(-EINVAL);
}
-
- data += bytes;
- len -= bytes;
- offset = 0;
+ } else {
+ bio_add_virt_nofail(bio, data, len);
}
-
bio->bi_end_io = bio_map_kern_endio;
return bio;
}
@@ -402,17 +364,16 @@ static void bio_copy_kern_endio_read(struct bio *bio)
/**
* bio_copy_kern - copy kernel address into bio
- * @q: the struct request_queue for the bio
* @data: pointer to buffer to copy
* @len: length in bytes
+ * @op: bio/request operation
* @gfp_mask: allocation flags for bio and page allocation
- * @reading: data direction is READ
*
* copy the kernel address into a bio suitable for io to a block
* device. Returns an error pointer in case of error.
*/
-static struct bio *bio_copy_kern(struct request_queue *q, void *data,
- unsigned int len, gfp_t gfp_mask, int reading)
+static struct bio *bio_copy_kern(void *data, unsigned int len, enum req_op op,
+ gfp_t gfp_mask)
{
unsigned long kaddr = (unsigned long)data;
unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
@@ -431,7 +392,7 @@ static struct bio *bio_copy_kern(struct request_queue *q, void *data,
bio = bio_kmalloc(nr_pages, gfp_mask);
if (!bio)
return ERR_PTR(-ENOMEM);
- bio_init(bio, NULL, bio->bi_inline_vecs, nr_pages, 0);
+ bio_init(bio, NULL, bio->bi_inline_vecs, nr_pages, op);
while (len) {
struct page *page;
@@ -444,7 +405,7 @@ static struct bio *bio_copy_kern(struct request_queue *q, void *data,
if (!page)
goto cleanup;
- if (!reading)
+ if (op_is_write(op))
memcpy(page_address(page), p, bytes);
if (bio_add_page(bio, page, bytes, 0) < bytes)
@@ -454,11 +415,11 @@ static struct bio *bio_copy_kern(struct request_queue *q, void *data,
p += bytes;
}
- if (reading) {
+ if (op_is_write(op)) {
+ bio->bi_end_io = bio_copy_kern_endio;
+ } else {
bio->bi_end_io = bio_copy_kern_endio_read;
bio->bi_private = data;
- } else {
- bio->bi_end_io = bio_copy_kern_endio;
}
return bio;
@@ -556,8 +517,6 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
if (map_data)
copy = true;
- else if (blk_queue_may_bounce(q))
- copy = true;
else if (iov_iter_alignment(iter) & align)
copy = true;
else if (iov_iter_is_bvec(iter))
@@ -689,7 +648,6 @@ EXPORT_SYMBOL(blk_rq_unmap_user);
/**
* blk_rq_map_kern - map kernel data to a request, for passthrough requests
- * @q: request queue where request should be inserted
* @rq: request to fill
* @kbuf: the kernel buffer
* @len: length of user data
@@ -700,31 +658,26 @@ EXPORT_SYMBOL(blk_rq_unmap_user);
* buffer is used. Can be called multiple times to append multiple
* buffers.
*/
-int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
- unsigned int len, gfp_t gfp_mask)
+int blk_rq_map_kern(struct request *rq, void *kbuf, unsigned int len,
+ gfp_t gfp_mask)
{
- int reading = rq_data_dir(rq) == READ;
unsigned long addr = (unsigned long) kbuf;
struct bio *bio;
int ret;
- if (len > (queue_max_hw_sectors(q) << 9))
+ if (len > (queue_max_hw_sectors(rq->q) << SECTOR_SHIFT))
return -EINVAL;
if (!len || !kbuf)
return -EINVAL;
- if (!blk_rq_aligned(q, addr, len) || object_is_on_stack(kbuf) ||
- blk_queue_may_bounce(q))
- bio = bio_copy_kern(q, kbuf, len, gfp_mask, reading);
+ if (!blk_rq_aligned(rq->q, addr, len) || object_is_on_stack(kbuf))
+ bio = bio_copy_kern(kbuf, len, req_op(rq), gfp_mask);
else
- bio = bio_map_kern(q, kbuf, len, gfp_mask);
+ bio = bio_map_kern(kbuf, len, req_op(rq), gfp_mask);
if (IS_ERR(bio))
return PTR_ERR(bio);
- bio->bi_opf &= ~REQ_OP_MASK;
- bio->bi_opf |= req_op(rq);
-
ret = blk_rq_append_bio(rq, bio);
if (unlikely(ret)) {
bio_uninit(bio);
diff --git a/block/blk-merge.c b/block/blk-merge.c
index fdd4efb54c6c..3af1d284add5 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -7,7 +7,6 @@
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/blk-integrity.h>
-#include <linux/scatterlist.h>
#include <linux/part_stat.h>
#include <linux/blk-cgroup.h>
@@ -226,27 +225,6 @@ static inline unsigned get_max_io_size(struct bio *bio,
}
/**
- * get_max_segment_size() - maximum number of bytes to add as a single segment
- * @lim: Request queue limits.
- * @paddr: address of the range to add
- * @len: maximum length available to add at @paddr
- *
- * Returns the maximum number of bytes of the range starting at @paddr that can
- * be added to a single segment.
- */
-static inline unsigned get_max_segment_size(const struct queue_limits *lim,
- phys_addr_t paddr, unsigned int len)
-{
- /*
- * Prevent an overflow if mask = ULONG_MAX and offset = 0 by adding 1
- * after having calculated the minimum.
- */
- return min_t(unsigned long, len,
- min(lim->seg_boundary_mask - (lim->seg_boundary_mask & paddr),
- (unsigned long)lim->max_segment_size - 1) + 1);
-}
-
-/**
* bvec_split_segs - verify whether or not a bvec should be split in the middle
* @lim: [in] queue limits to split based on
* @bv: [in] bvec to examine
@@ -473,117 +451,6 @@ unsigned int blk_recalc_rq_segments(struct request *rq)
return nr_phys_segs;
}
-struct phys_vec {
- phys_addr_t paddr;
- u32 len;
-};
-
-static bool blk_map_iter_next(struct request *req,
- struct req_iterator *iter, struct phys_vec *vec)
-{
- unsigned int max_size;
- struct bio_vec bv;
-
- if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
- if (!iter->bio)
- return false;
- vec->paddr = bvec_phys(&req->special_vec);
- vec->len = req->special_vec.bv_len;
- iter->bio = NULL;
- return true;
- }
-
- if (!iter->iter.bi_size)
- return false;
-
- bv = mp_bvec_iter_bvec(iter->bio->bi_io_vec, iter->iter);
- vec->paddr = bvec_phys(&bv);
- max_size = get_max_segment_size(&req->q->limits, vec->paddr, UINT_MAX);
- bv.bv_len = min(bv.bv_len, max_size);
- bio_advance_iter_single(iter->bio, &iter->iter, bv.bv_len);
-
- /*
- * If we are entirely done with this bi_io_vec entry, check if the next
- * one could be merged into it. This typically happens when moving to
- * the next bio, but some callers also don't pack bvecs tight.
- */
- while (!iter->iter.bi_size || !iter->iter.bi_bvec_done) {
- struct bio_vec next;
-
- if (!iter->iter.bi_size) {
- if (!iter->bio->bi_next)
- break;
- iter->bio = iter->bio->bi_next;
- iter->iter = iter->bio->bi_iter;
- }
-
- next = mp_bvec_iter_bvec(iter->bio->bi_io_vec, iter->iter);
- if (bv.bv_len + next.bv_len > max_size ||
- !biovec_phys_mergeable(req->q, &bv, &next))
- break;
-
- bv.bv_len += next.bv_len;
- bio_advance_iter_single(iter->bio, &iter->iter, next.bv_len);
- }
-
- vec->len = bv.bv_len;
- return true;
-}
-
-static inline struct scatterlist *blk_next_sg(struct scatterlist **sg,
- struct scatterlist *sglist)
-{
- if (!*sg)
- return sglist;
-
- /*
- * If the driver previously mapped a shorter list, we could see a
- * termination bit prematurely unless it fully inits the sg table
- * on each mapping. We KNOW that there must be more entries here
- * or the driver would be buggy, so force clear the termination bit
- * to avoid doing a full sg_init_table() in drivers for each command.
- */
- sg_unmark_end(*sg);
- return sg_next(*sg);
-}
-
-/*
- * Map a request to scatterlist, return number of sg entries setup. Caller
- * must make sure sg can hold rq->nr_phys_segments entries.
- */
-int __blk_rq_map_sg(struct request *rq, struct scatterlist *sglist,
- struct scatterlist **last_sg)
-{
- struct req_iterator iter = {
- .bio = rq->bio,
- };
- struct phys_vec vec;
- int nsegs = 0;
-
- /* the internal flush request may not have bio attached */
- if (iter.bio)
- iter.iter = iter.bio->bi_iter;
-
- while (blk_map_iter_next(rq, &iter, &vec)) {
- *last_sg = blk_next_sg(last_sg, sglist);
- sg_set_page(*last_sg, phys_to_page(vec.paddr), vec.len,
- offset_in_page(vec.paddr));
- nsegs++;
- }
-
- if (*last_sg)
- sg_mark_end(*last_sg);
-
- /*
- * Something must have been wrong if the figured number of
- * segment is bigger than number of req's physical segments
- */
- WARN_ON(nsegs > blk_rq_nr_phys_segments(rq));
-
- return nsegs;
-}
-EXPORT_SYMBOL(__blk_rq_map_sg);
-
static inline unsigned int blk_rq_get_max_sectors(struct request *rq,
sector_t offset)
{
@@ -832,6 +699,8 @@ static struct request *attempt_merge(struct request_queue *q,
if (req->bio->bi_write_hint != next->bio->bi_write_hint)
return NULL;
+ if (req->bio->bi_write_stream != next->bio->bi_write_stream)
+ return NULL;
if (req->bio->bi_ioprio != next->bio->bi_ioprio)
return NULL;
if (!blk_atomic_write_mergeable_rqs(req, next))
@@ -953,6 +822,8 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
return false;
if (rq->bio->bi_write_hint != bio->bi_write_hint)
return false;
+ if (rq->bio->bi_write_stream != bio->bi_write_stream)
+ return false;
if (rq->bio->bi_ioprio != bio->bi_ioprio)
return false;
if (blk_atomic_write_mergeable_rq_bio(rq, bio) == false)
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 3421b5521fe2..29b3540dd180 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -93,6 +93,8 @@ static const char *const blk_queue_flag_name[] = {
QUEUE_FLAG_NAME(RQ_ALLOC_TIME),
QUEUE_FLAG_NAME(HCTX_ACTIVE),
QUEUE_FLAG_NAME(SQ_SCHED),
+ QUEUE_FLAG_NAME(DISABLE_WBT_DEF),
+ QUEUE_FLAG_NAME(NO_ELV_SWITCH),
};
#undef QUEUE_FLAG_NAME
@@ -624,20 +626,9 @@ void blk_mq_debugfs_register(struct request_queue *q)
debugfs_create_files(q->debugfs_dir, q, blk_mq_debugfs_queue_attrs);
- /*
- * blk_mq_init_sched() attempted to do this already, but q->debugfs_dir
- * didn't exist yet (because we don't know what to name the directory
- * until the queue is registered to a gendisk).
- */
- if (q->elevator && !q->sched_debugfs_dir)
- blk_mq_debugfs_register_sched(q);
-
- /* Similarly, blk_mq_init_hctx() couldn't do this previously. */
queue_for_each_hw_ctx(q, hctx, i) {
if (!hctx->debugfs_dir)
blk_mq_debugfs_register_hctx(q, hctx);
- if (q->elevator && !hctx->sched_debugfs_dir)
- blk_mq_debugfs_register_sched_hctx(q, hctx);
}
if (q->rq_qos) {
diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c
new file mode 100644
index 000000000000..82bae475dfa4
--- /dev/null
+++ b/block/blk-mq-dma.c
@@ -0,0 +1,116 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2025 Christoph Hellwig
+ */
+#include "blk.h"
+
+struct phys_vec {
+ phys_addr_t paddr;
+ u32 len;
+};
+
+static bool blk_map_iter_next(struct request *req, struct req_iterator *iter,
+ struct phys_vec *vec)
+{
+ unsigned int max_size;
+ struct bio_vec bv;
+
+ if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
+ if (!iter->bio)
+ return false;
+ vec->paddr = bvec_phys(&req->special_vec);
+ vec->len = req->special_vec.bv_len;
+ iter->bio = NULL;
+ return true;
+ }
+
+ if (!iter->iter.bi_size)
+ return false;
+
+ bv = mp_bvec_iter_bvec(iter->bio->bi_io_vec, iter->iter);
+ vec->paddr = bvec_phys(&bv);
+ max_size = get_max_segment_size(&req->q->limits, vec->paddr, UINT_MAX);
+ bv.bv_len = min(bv.bv_len, max_size);
+ bio_advance_iter_single(iter->bio, &iter->iter, bv.bv_len);
+
+ /*
+ * If we are entirely done with this bi_io_vec entry, check if the next
+ * one could be merged into it. This typically happens when moving to
+ * the next bio, but some callers also don't pack bvecs tight.
+ */
+ while (!iter->iter.bi_size || !iter->iter.bi_bvec_done) {
+ struct bio_vec next;
+
+ if (!iter->iter.bi_size) {
+ if (!iter->bio->bi_next)
+ break;
+ iter->bio = iter->bio->bi_next;
+ iter->iter = iter->bio->bi_iter;
+ }
+
+ next = mp_bvec_iter_bvec(iter->bio->bi_io_vec, iter->iter);
+ if (bv.bv_len + next.bv_len > max_size ||
+ !biovec_phys_mergeable(req->q, &bv, &next))
+ break;
+
+ bv.bv_len += next.bv_len;
+ bio_advance_iter_single(iter->bio, &iter->iter, next.bv_len);
+ }
+
+ vec->len = bv.bv_len;
+ return true;
+}
+
+static inline struct scatterlist *
+blk_next_sg(struct scatterlist **sg, struct scatterlist *sglist)
+{
+ if (!*sg)
+ return sglist;
+
+ /*
+ * If the driver previously mapped a shorter list, we could see a
+ * termination bit prematurely unless it fully inits the sg table
+ * on each mapping. We KNOW that there must be more entries here
+ * or the driver would be buggy, so force clear the termination bit
+ * to avoid doing a full sg_init_table() in drivers for each command.
+ */
+ sg_unmark_end(*sg);
+ return sg_next(*sg);
+}
+
+/*
+ * Map a request to scatterlist, return number of sg entries setup. Caller
+ * must make sure sg can hold rq->nr_phys_segments entries.
+ */
+int __blk_rq_map_sg(struct request *rq, struct scatterlist *sglist,
+ struct scatterlist **last_sg)
+{
+ struct req_iterator iter = {
+ .bio = rq->bio,
+ };
+ struct phys_vec vec;
+ int nsegs = 0;
+
+ /* the internal flush request may not have bio attached */
+ if (iter.bio)
+ iter.iter = iter.bio->bi_iter;
+
+ while (blk_map_iter_next(rq, &iter, &vec)) {
+ *last_sg = blk_next_sg(last_sg, sglist);
+ sg_set_page(*last_sg, phys_to_page(vec.paddr), vec.len,
+ offset_in_page(vec.paddr));
+ nsegs++;
+ }
+
+ if (*last_sg)
+ sg_mark_end(*last_sg);
+
+ /*
+ * Something must have been wrong if the figured number of
+ * segment is bigger than number of req's physical segments
+ */
+ WARN_ON(nsegs > blk_rq_nr_phys_segments(rq));
+
+ return nsegs;
+}
+EXPORT_SYMBOL(__blk_rq_map_sg);
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 109611445d40..55a0fd105147 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -59,19 +59,17 @@ static bool blk_mq_dispatch_hctx_list(struct list_head *rq_list)
list_first_entry(rq_list, struct request, queuelist)->mq_hctx;
struct request *rq;
LIST_HEAD(hctx_list);
- unsigned int count = 0;
list_for_each_entry(rq, rq_list, queuelist) {
if (rq->mq_hctx != hctx) {
list_cut_before(&hctx_list, rq_list, &rq->queuelist);
goto dispatch;
}
- count++;
}
list_splice_tail_init(rq_list, &hctx_list);
dispatch:
- return blk_mq_dispatch_rq_list(hctx, &hctx_list, count);
+ return blk_mq_dispatch_rq_list(hctx, &hctx_list, false);
}
#define BLK_MQ_BUDGET_DELAY 3 /* ms units */
@@ -167,7 +165,7 @@ static int __blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
dispatched |= blk_mq_dispatch_hctx_list(&rq_list);
} while (!list_empty(&rq_list));
} else {
- dispatched = blk_mq_dispatch_rq_list(hctx, &rq_list, count);
+ dispatched = blk_mq_dispatch_rq_list(hctx, &rq_list, false);
}
if (busy)
@@ -261,7 +259,7 @@ static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
/* round robin for fair dispatch */
ctx = blk_mq_next_ctx(hctx, rq->mq_ctx);
- } while (blk_mq_dispatch_rq_list(rq->mq_hctx, &rq_list, 1));
+ } while (blk_mq_dispatch_rq_list(rq->mq_hctx, &rq_list, false));
WRITE_ONCE(hctx->dispatch_from, ctx);
return ret;
@@ -298,7 +296,7 @@ static int __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
*/
if (!list_empty(&rq_list)) {
blk_mq_sched_mark_restart_hctx(hctx);
- if (!blk_mq_dispatch_rq_list(hctx, &rq_list, 0))
+ if (!blk_mq_dispatch_rq_list(hctx, &rq_list, true))
return 0;
need_dispatch = true;
} else {
@@ -312,7 +310,7 @@ static int __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
if (need_dispatch)
return blk_mq_do_dispatch_ctx(hctx);
blk_mq_flush_busy_ctxs(hctx, &rq_list);
- blk_mq_dispatch_rq_list(hctx, &rq_list, 0);
+ blk_mq_dispatch_rq_list(hctx, &rq_list, true);
return 0;
}
@@ -436,6 +434,30 @@ static int blk_mq_init_sched_shared_tags(struct request_queue *queue)
return 0;
}
+void blk_mq_sched_reg_debugfs(struct request_queue *q)
+{
+ struct blk_mq_hw_ctx *hctx;
+ unsigned long i;
+
+ mutex_lock(&q->debugfs_mutex);
+ blk_mq_debugfs_register_sched(q);
+ queue_for_each_hw_ctx(q, hctx, i)
+ blk_mq_debugfs_register_sched_hctx(q, hctx);
+ mutex_unlock(&q->debugfs_mutex);
+}
+
+void blk_mq_sched_unreg_debugfs(struct request_queue *q)
+{
+ struct blk_mq_hw_ctx *hctx;
+ unsigned long i;
+
+ mutex_lock(&q->debugfs_mutex);
+ queue_for_each_hw_ctx(q, hctx, i)
+ blk_mq_debugfs_unregister_sched_hctx(hctx);
+ blk_mq_debugfs_unregister_sched(q);
+ mutex_unlock(&q->debugfs_mutex);
+}
+
/* caller must have a reference to @e, will grab another one if successful */
int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
{
@@ -469,10 +491,6 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
if (ret)
goto err_free_map_and_rqs;
- mutex_lock(&q->debugfs_mutex);
- blk_mq_debugfs_register_sched(q);
- mutex_unlock(&q->debugfs_mutex);
-
queue_for_each_hw_ctx(q, hctx, i) {
if (e->ops.init_hctx) {
ret = e->ops.init_hctx(hctx, i);
@@ -484,11 +502,7 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
return ret;
}
}
- mutex_lock(&q->debugfs_mutex);
- blk_mq_debugfs_register_sched_hctx(q, hctx);
- mutex_unlock(&q->debugfs_mutex);
}
-
return 0;
err_free_map_and_rqs:
@@ -527,10 +541,6 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
unsigned int flags = 0;
queue_for_each_hw_ctx(q, hctx, i) {
- mutex_lock(&q->debugfs_mutex);
- blk_mq_debugfs_unregister_sched_hctx(hctx);
- mutex_unlock(&q->debugfs_mutex);
-
if (e->type->ops.exit_hctx && hctx->sched_data) {
e->type->ops.exit_hctx(hctx, i);
hctx->sched_data = NULL;
@@ -538,12 +548,9 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
flags = hctx->flags;
}
- mutex_lock(&q->debugfs_mutex);
- blk_mq_debugfs_unregister_sched(q);
- mutex_unlock(&q->debugfs_mutex);
-
if (e->type->ops.exit_sched)
e->type->ops.exit_sched(e);
blk_mq_sched_tags_teardown(q, flags);
+ set_bit(ELEVATOR_FLAG_DYING, &q->elevator->flags);
q->elevator = NULL;
}
diff --git a/block/blk-mq.c b/block/blk-mq.c
index c2697db59109..4806b867e37d 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -89,7 +89,7 @@ struct mq_inflight {
unsigned int inflight[2];
};
-static bool blk_mq_check_inflight(struct request *rq, void *priv)
+static bool blk_mq_check_in_driver(struct request *rq, void *priv)
{
struct mq_inflight *mi = priv;
@@ -101,24 +101,14 @@ static bool blk_mq_check_inflight(struct request *rq, void *priv)
return true;
}
-unsigned int blk_mq_in_flight(struct request_queue *q,
- struct block_device *part)
+void blk_mq_in_driver_rw(struct block_device *part, unsigned int inflight[2])
{
struct mq_inflight mi = { .part = part };
- blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
-
- return mi.inflight[0] + mi.inflight[1];
-}
-
-void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part,
- unsigned int inflight[2])
-{
- struct mq_inflight mi = { .part = part };
-
- blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
- inflight[0] = mi.inflight[0];
- inflight[1] = mi.inflight[1];
+ blk_mq_queue_tag_busy_iter(bdev_get_queue(part), blk_mq_check_in_driver,
+ &mi);
+ inflight[READ] = mi.inflight[READ];
+ inflight[WRITE] = mi.inflight[WRITE];
}
#ifdef CONFIG_LOCKDEP
@@ -584,9 +574,13 @@ static struct request *blk_mq_rq_cache_fill(struct request_queue *q,
struct blk_mq_alloc_data data = {
.q = q,
.flags = flags,
+ .shallow_depth = 0,
.cmd_flags = opf,
+ .rq_flags = 0,
.nr_tags = plug->nr_ios,
.cached_rqs = &plug->cached_rqs,
+ .ctx = NULL,
+ .hctx = NULL
};
struct request *rq;
@@ -646,8 +640,13 @@ struct request *blk_mq_alloc_request(struct request_queue *q, blk_opf_t opf,
struct blk_mq_alloc_data data = {
.q = q,
.flags = flags,
+ .shallow_depth = 0,
.cmd_flags = opf,
+ .rq_flags = 0,
.nr_tags = 1,
+ .cached_rqs = NULL,
+ .ctx = NULL,
+ .hctx = NULL
};
int ret;
@@ -675,8 +674,13 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
struct blk_mq_alloc_data data = {
.q = q,
.flags = flags,
+ .shallow_depth = 0,
.cmd_flags = opf,
+ .rq_flags = 0,
.nr_tags = 1,
+ .cached_rqs = NULL,
+ .ctx = NULL,
+ .hctx = NULL
};
u64 alloc_time_ns = 0;
struct request *rq;
@@ -2080,7 +2084,7 @@ static void blk_mq_commit_rqs(struct blk_mq_hw_ctx *hctx, int queued,
* Returns true if we did some work AND can potentially do more.
*/
bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
- unsigned int nr_budgets)
+ bool get_budget)
{
enum prep_dispatch prep;
struct request_queue *q = hctx->queue;
@@ -2102,7 +2106,7 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
rq = list_first_entry(list, struct request, queuelist);
WARN_ON_ONCE(hctx != rq->mq_hctx);
- prep = blk_mq_prep_dispatch_rq(rq, !nr_budgets);
+ prep = blk_mq_prep_dispatch_rq(rq, get_budget);
if (prep != PREP_DISPATCH_OK)
break;
@@ -2111,12 +2115,6 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
bd.rq = rq;
bd.last = list_empty(list);
- /*
- * once the request is queued to lld, no need to cover the
- * budget any more
- */
- if (nr_budgets)
- nr_budgets--;
ret = q->mq_ops->queue_rq(hctx, &bd);
switch (ret) {
case BLK_STS_OK:
@@ -2150,7 +2148,11 @@ out:
((hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) ||
blk_mq_is_shared_tags(hctx->flags));
- if (nr_budgets)
+ /*
+ * If the caller allocated budgets, free the budgets of the
+ * requests that have not yet been passed to the block driver.
+ */
+ if (!get_budget)
blk_mq_release_budgets(q, list);
spin_lock(&hctx->lock);
@@ -2778,15 +2780,15 @@ static blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last)
return __blk_mq_issue_directly(hctx, rq, last);
}
-static void blk_mq_plug_issue_direct(struct blk_plug *plug)
+static void blk_mq_issue_direct(struct rq_list *rqs)
{
struct blk_mq_hw_ctx *hctx = NULL;
struct request *rq;
int queued = 0;
blk_status_t ret = BLK_STS_OK;
- while ((rq = rq_list_pop(&plug->mq_list))) {
- bool last = rq_list_empty(&plug->mq_list);
+ while ((rq = rq_list_pop(rqs))) {
+ bool last = rq_list_empty(rqs);
if (hctx != rq->mq_hctx) {
if (hctx) {
@@ -2817,15 +2819,64 @@ out:
blk_mq_commit_rqs(hctx, queued, false);
}
-static void __blk_mq_flush_plug_list(struct request_queue *q,
- struct blk_plug *plug)
+static void __blk_mq_flush_list(struct request_queue *q, struct rq_list *rqs)
{
if (blk_queue_quiesced(q))
return;
- q->mq_ops->queue_rqs(&plug->mq_list);
+ q->mq_ops->queue_rqs(rqs);
+}
+
+static unsigned blk_mq_extract_queue_requests(struct rq_list *rqs,
+ struct rq_list *queue_rqs)
+{
+ struct request *rq = rq_list_pop(rqs);
+ struct request_queue *this_q = rq->q;
+ struct request **prev = &rqs->head;
+ struct rq_list matched_rqs = {};
+ struct request *last = NULL;
+ unsigned depth = 1;
+
+ rq_list_add_tail(&matched_rqs, rq);
+ while ((rq = *prev)) {
+ if (rq->q == this_q) {
+ /* move rq from rqs to matched_rqs */
+ *prev = rq->rq_next;
+ rq_list_add_tail(&matched_rqs, rq);
+ depth++;
+ } else {
+ /* leave rq in rqs */
+ prev = &rq->rq_next;
+ last = rq;
+ }
+ }
+
+ rqs->tail = last;
+ *queue_rqs = matched_rqs;
+ return depth;
+}
+
+static void blk_mq_dispatch_queue_requests(struct rq_list *rqs, unsigned depth)
+{
+ struct request_queue *q = rq_list_peek(rqs)->q;
+
+ trace_block_unplug(q, depth, true);
+
+ /*
+ * Peek first request and see if we have a ->queue_rqs() hook.
+ * If we do, we can dispatch the whole list in one go.
+ * We already know at this point that all requests belong to the
+ * same queue, caller must ensure that's the case.
+ */
+ if (q->mq_ops->queue_rqs) {
+ blk_mq_run_dispatch_ops(q, __blk_mq_flush_list(q, rqs));
+ if (rq_list_empty(rqs))
+ return;
+ }
+
+ blk_mq_run_dispatch_ops(q, blk_mq_issue_direct(rqs));
}
-static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched)
+static void blk_mq_dispatch_list(struct rq_list *rqs, bool from_sched)
{
struct blk_mq_hw_ctx *this_hctx = NULL;
struct blk_mq_ctx *this_ctx = NULL;
@@ -2835,7 +2886,7 @@ static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched)
LIST_HEAD(list);
do {
- struct request *rq = rq_list_pop(&plug->mq_list);
+ struct request *rq = rq_list_pop(rqs);
if (!this_hctx) {
this_hctx = rq->mq_hctx;
@@ -2848,9 +2899,9 @@ static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched)
}
list_add_tail(&rq->queuelist, &list);
depth++;
- } while (!rq_list_empty(&plug->mq_list));
+ } while (!rq_list_empty(rqs));
- plug->mq_list = requeue_list;
+ *rqs = requeue_list;
trace_block_unplug(this_hctx->queue, depth, !from_sched);
percpu_ref_get(&this_hctx->queue->q_usage_counter);
@@ -2870,9 +2921,21 @@ static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched)
percpu_ref_put(&this_hctx->queue->q_usage_counter);
}
+static void blk_mq_dispatch_multiple_queue_requests(struct rq_list *rqs)
+{
+ do {
+ struct rq_list queue_rqs;
+ unsigned depth;
+
+ depth = blk_mq_extract_queue_requests(rqs, &queue_rqs);
+ blk_mq_dispatch_queue_requests(&queue_rqs, depth);
+ while (!rq_list_empty(&queue_rqs))
+ blk_mq_dispatch_list(&queue_rqs, false);
+ } while (!rq_list_empty(rqs));
+}
+
void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
{
- struct request *rq;
unsigned int depth;
/*
@@ -2887,34 +2950,19 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
depth = plug->rq_count;
plug->rq_count = 0;
- if (!plug->multiple_queues && !plug->has_elevator && !from_schedule) {
- struct request_queue *q;
-
- rq = rq_list_peek(&plug->mq_list);
- q = rq->q;
- trace_block_unplug(q, depth, true);
-
- /*
- * Peek first request and see if we have a ->queue_rqs() hook.
- * If we do, we can dispatch the whole plug list in one go. We
- * already know at this point that all requests belong to the
- * same queue, caller must ensure that's the case.
- */
- if (q->mq_ops->queue_rqs) {
- blk_mq_run_dispatch_ops(q,
- __blk_mq_flush_plug_list(q, plug));
- if (rq_list_empty(&plug->mq_list))
- return;
+ if (!plug->has_elevator && !from_schedule) {
+ if (plug->multiple_queues) {
+ blk_mq_dispatch_multiple_queue_requests(&plug->mq_list);
+ return;
}
- blk_mq_run_dispatch_ops(q,
- blk_mq_plug_issue_direct(plug));
+ blk_mq_dispatch_queue_requests(&plug->mq_list, depth);
if (rq_list_empty(&plug->mq_list))
return;
}
do {
- blk_mq_dispatch_plug_list(plug, from_schedule);
+ blk_mq_dispatch_list(&plug->mq_list, from_schedule);
} while (!rq_list_empty(&plug->mq_list));
}
@@ -2969,8 +3017,14 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q,
{
struct blk_mq_alloc_data data = {
.q = q,
- .nr_tags = 1,
+ .flags = 0,
+ .shallow_depth = 0,
.cmd_flags = bio->bi_opf,
+ .rq_flags = 0,
+ .nr_tags = 1,
+ .cached_rqs = NULL,
+ .ctx = NULL,
+ .hctx = NULL
};
struct request *rq;
@@ -3080,8 +3134,6 @@ void blk_mq_submit_bio(struct bio *bio)
goto new_request;
}
- bio = blk_queue_bounce(bio, q);
-
/*
* The cached request already holds a q_usage_counter reference and we
* don't have to acquire a new one if we use it.
@@ -4094,8 +4146,6 @@ static void blk_mq_map_swqueue(struct request_queue *q)
struct blk_mq_ctx *ctx;
struct blk_mq_tag_set *set = q->tag_set;
- mutex_lock(&q->elevator_lock);
-
queue_for_each_hw_ctx(q, hctx, i) {
cpumask_clear(hctx->cpumask);
hctx->nr_ctx = 0;
@@ -4200,8 +4250,6 @@ static void blk_mq_map_swqueue(struct request_queue *q)
hctx->next_cpu = blk_mq_first_mapped_cpu(hctx);
hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
}
-
- mutex_unlock(&q->elevator_lock);
}
/*
@@ -4505,16 +4553,9 @@ static void __blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
}
static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
- struct request_queue *q, bool lock)
+ struct request_queue *q)
{
- if (lock) {
- /* protect against switching io scheduler */
- mutex_lock(&q->elevator_lock);
- __blk_mq_realloc_hw_ctxs(set, q);
- mutex_unlock(&q->elevator_lock);
- } else {
- __blk_mq_realloc_hw_ctxs(set, q);
- }
+ __blk_mq_realloc_hw_ctxs(set, q);
/* unregister cpuhp callbacks for exited hctxs */
blk_mq_remove_hw_queues_cpuhp(q);
@@ -4546,7 +4587,7 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
xa_init(&q->hctx_table);
- blk_mq_realloc_hw_ctxs(set, q, false);
+ blk_mq_realloc_hw_ctxs(set, q);
if (!q->nr_hw_queues)
goto err_hctxs;
@@ -4563,8 +4604,8 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
q->nr_requests = set->queue_depth;
blk_mq_init_cpu_queues(q, set->nr_hw_queues);
- blk_mq_add_queue_tag_set(set, q);
blk_mq_map_swqueue(q);
+ blk_mq_add_queue_tag_set(set, q);
return 0;
err_hctxs:
@@ -4784,6 +4825,8 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
goto out_free_srcu;
}
+ init_rwsem(&set->update_nr_hwq_lock);
+
ret = -ENOMEM;
set->tags = kcalloc_node(set->nr_hw_queues,
sizeof(struct blk_mq_tags *), GFP_KERNEL,
@@ -4923,88 +4966,10 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
return ret;
}
-/*
- * request_queue and elevator_type pair.
- * It is just used by __blk_mq_update_nr_hw_queues to cache
- * the elevator_type associated with a request_queue.
- */
-struct blk_mq_qe_pair {
- struct list_head node;
- struct request_queue *q;
- struct elevator_type *type;
-};
-
-/*
- * Cache the elevator_type in qe pair list and switch the
- * io scheduler to 'none'
- */
-static bool blk_mq_elv_switch_none(struct list_head *head,
- struct request_queue *q)
-{
- struct blk_mq_qe_pair *qe;
-
- qe = kmalloc(sizeof(*qe), GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY);
- if (!qe)
- return false;
-
- /* Accessing q->elevator needs protection from ->elevator_lock. */
- mutex_lock(&q->elevator_lock);
-
- if (!q->elevator) {
- kfree(qe);
- goto unlock;
- }
-
- INIT_LIST_HEAD(&qe->node);
- qe->q = q;
- qe->type = q->elevator->type;
- /* keep a reference to the elevator module as we'll switch back */
- __elevator_get(qe->type);
- list_add(&qe->node, head);
- elevator_disable(q);
-unlock:
- mutex_unlock(&q->elevator_lock);
-
- return true;
-}
-
-static struct blk_mq_qe_pair *blk_lookup_qe_pair(struct list_head *head,
- struct request_queue *q)
-{
- struct blk_mq_qe_pair *qe;
-
- list_for_each_entry(qe, head, node)
- if (qe->q == q)
- return qe;
-
- return NULL;
-}
-
-static void blk_mq_elv_switch_back(struct list_head *head,
- struct request_queue *q)
-{
- struct blk_mq_qe_pair *qe;
- struct elevator_type *t;
-
- qe = blk_lookup_qe_pair(head, q);
- if (!qe)
- return;
- t = qe->type;
- list_del(&qe->node);
- kfree(qe);
-
- mutex_lock(&q->elevator_lock);
- elevator_switch(q, t);
- /* drop the reference acquired in blk_mq_elv_switch_none */
- elevator_put(t);
- mutex_unlock(&q->elevator_lock);
-}
-
static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
int nr_hw_queues)
{
struct request_queue *q;
- LIST_HEAD(head);
int prev_nr_hw_queues = set->nr_hw_queues;
unsigned int memflags;
int i;
@@ -5019,30 +4984,24 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
return;
memflags = memalloc_noio_save();
- list_for_each_entry(q, &set->tag_list, tag_set_list)
- blk_mq_freeze_queue_nomemsave(q);
-
- /*
- * Switch IO scheduler to 'none', cleaning up the data associated
- * with the previous scheduler. We will switch back once we are done
- * updating the new sw to hw queue mappings.
- */
- list_for_each_entry(q, &set->tag_list, tag_set_list)
- if (!blk_mq_elv_switch_none(&head, q))
- goto switch_back;
-
list_for_each_entry(q, &set->tag_list, tag_set_list) {
blk_mq_debugfs_unregister_hctxs(q);
blk_mq_sysfs_unregister_hctxs(q);
}
- if (blk_mq_realloc_tag_set_tags(set, nr_hw_queues) < 0)
+ list_for_each_entry(q, &set->tag_list, tag_set_list)
+ blk_mq_freeze_queue_nomemsave(q);
+
+ if (blk_mq_realloc_tag_set_tags(set, nr_hw_queues) < 0) {
+ list_for_each_entry(q, &set->tag_list, tag_set_list)
+ blk_mq_unfreeze_queue_nomemrestore(q);
goto reregister;
+ }
fallback:
blk_mq_update_queue_map(set);
list_for_each_entry(q, &set->tag_list, tag_set_list) {
- blk_mq_realloc_hw_ctxs(set, q, true);
+ __blk_mq_realloc_hw_ctxs(set, q);
if (q->nr_hw_queues != set->nr_hw_queues) {
int i = prev_nr_hw_queues;
@@ -5058,18 +5017,18 @@ fallback:
blk_mq_map_swqueue(q);
}
+ /* elv_update_nr_hw_queues() unfreeze queue for us */
+ list_for_each_entry(q, &set->tag_list, tag_set_list)
+ elv_update_nr_hw_queues(q);
+
reregister:
list_for_each_entry(q, &set->tag_list, tag_set_list) {
blk_mq_sysfs_register_hctxs(q);
blk_mq_debugfs_register_hctxs(q);
- }
-
-switch_back:
- list_for_each_entry(q, &set->tag_list, tag_set_list)
- blk_mq_elv_switch_back(&head, q);
- list_for_each_entry(q, &set->tag_list, tag_set_list)
- blk_mq_unfreeze_queue_nomemrestore(q);
+ blk_mq_remove_hw_queues_cpuhp(q);
+ blk_mq_add_hw_queues_cpuhp(q);
+ }
memalloc_noio_restore(memflags);
/* Free the excess tags when nr_hw_queues shrink. */
@@ -5079,9 +5038,11 @@ switch_back:
void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
{
+ down_write(&set->update_nr_hwq_lock);
mutex_lock(&set->tag_list_lock);
__blk_mq_update_nr_hw_queues(set, nr_hw_queues);
mutex_unlock(&set->tag_list_lock);
+ up_write(&set->update_nr_hwq_lock);
}
EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 3011a78cf16a..affb2e14b56e 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -48,7 +48,7 @@ void blk_mq_exit_queue(struct request_queue *q);
int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
void blk_mq_wake_waiters(struct request_queue *q);
bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *,
- unsigned int);
+ bool);
void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list);
struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
struct blk_mq_ctx *start);
@@ -246,10 +246,7 @@ static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx)
return hctx->nr_ctx && hctx->tags;
}
-unsigned int blk_mq_in_flight(struct request_queue *q,
- struct block_device *part);
-void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part,
- unsigned int inflight[2]);
+void blk_mq_in_driver_rw(struct block_device *part, unsigned int inflight[2]);
static inline void blk_mq_put_dispatch_budget(struct request_queue *q,
int budget_token)
diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c
index 95982bc46ba1..848591fb3c57 100644
--- a/block/blk-rq-qos.c
+++ b/block/blk-rq-qos.c
@@ -2,6 +2,8 @@
#include "blk-rq-qos.h"
+__read_mostly DEFINE_STATIC_KEY_FALSE(block_rq_qos);
+
/*
* Increment 'v', if 'v' is below 'below'. Returns true if we succeeded,
* false if 'v' + 1 would be bigger than 'below'.
@@ -317,6 +319,7 @@ void rq_qos_exit(struct request_queue *q)
struct rq_qos *rqos = q->rq_qos;
q->rq_qos = rqos->next;
rqos->ops->exit(rqos);
+ static_branch_dec(&block_rq_qos);
}
mutex_unlock(&q->rq_qos_mutex);
}
@@ -343,6 +346,7 @@ int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id,
goto ebusy;
rqos->next = q->rq_qos;
q->rq_qos = rqos;
+ static_branch_inc(&block_rq_qos);
blk_mq_unfreeze_queue(q, memflags);
diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h
index 37245c97ee61..39749f4066fb 100644
--- a/block/blk-rq-qos.h
+++ b/block/blk-rq-qos.h
@@ -12,6 +12,7 @@
#include "blk-mq-debugfs.h"
struct blk_mq_debugfs_attr;
+extern struct static_key_false block_rq_qos;
enum rq_qos_id {
RQ_QOS_WBT,
@@ -112,31 +113,33 @@ void __rq_qos_queue_depth_changed(struct rq_qos *rqos);
static inline void rq_qos_cleanup(struct request_queue *q, struct bio *bio)
{
- if (q->rq_qos)
+ if (static_branch_unlikely(&block_rq_qos) && q->rq_qos)
__rq_qos_cleanup(q->rq_qos, bio);
}
static inline void rq_qos_done(struct request_queue *q, struct request *rq)
{
- if (q->rq_qos && !blk_rq_is_passthrough(rq))
+ if (static_branch_unlikely(&block_rq_qos) && q->rq_qos &&
+ !blk_rq_is_passthrough(rq))
__rq_qos_done(q->rq_qos, rq);
}
static inline void rq_qos_issue(struct request_queue *q, struct request *rq)
{
- if (q->rq_qos)
+ if (static_branch_unlikely(&block_rq_qos) && q->rq_qos)
__rq_qos_issue(q->rq_qos, rq);
}
static inline void rq_qos_requeue(struct request_queue *q, struct request *rq)
{
- if (q->rq_qos)
+ if (static_branch_unlikely(&block_rq_qos) && q->rq_qos)
__rq_qos_requeue(q->rq_qos, rq);
}
static inline void rq_qos_done_bio(struct bio *bio)
{
- if (bio->bi_bdev && (bio_flagged(bio, BIO_QOS_THROTTLED) ||
+ if (static_branch_unlikely(&block_rq_qos) &&
+ bio->bi_bdev && (bio_flagged(bio, BIO_QOS_THROTTLED) ||
bio_flagged(bio, BIO_QOS_MERGED))) {
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
if (q->rq_qos)
@@ -146,7 +149,7 @@ static inline void rq_qos_done_bio(struct bio *bio)
static inline void rq_qos_throttle(struct request_queue *q, struct bio *bio)
{
- if (q->rq_qos) {
+ if (static_branch_unlikely(&block_rq_qos) && q->rq_qos) {
bio_set_flag(bio, BIO_QOS_THROTTLED);
__rq_qos_throttle(q->rq_qos, bio);
}
@@ -155,14 +158,14 @@ static inline void rq_qos_throttle(struct request_queue *q, struct bio *bio)
static inline void rq_qos_track(struct request_queue *q, struct request *rq,
struct bio *bio)
{
- if (q->rq_qos)
+ if (static_branch_unlikely(&block_rq_qos) && q->rq_qos)
__rq_qos_track(q->rq_qos, rq, bio);
}
static inline void rq_qos_merge(struct request_queue *q, struct request *rq,
struct bio *bio)
{
- if (q->rq_qos) {
+ if (static_branch_unlikely(&block_rq_qos) && q->rq_qos) {
bio_set_flag(bio, BIO_QOS_MERGED);
__rq_qos_merge(q->rq_qos, rq, bio);
}
@@ -170,7 +173,7 @@ static inline void rq_qos_merge(struct request_queue *q, struct request *rq,
static inline void rq_qos_queue_depth_changed(struct request_queue *q)
{
- if (q->rq_qos)
+ if (static_branch_unlikely(&block_rq_qos) && q->rq_qos)
__rq_qos_queue_depth_changed(q->rq_qos);
}
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 4817e7ca03f8..a000daafbfb4 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -124,11 +124,6 @@ static int blk_validate_integrity_limits(struct queue_limits *lim)
return 0;
}
- if (lim->features & BLK_FEAT_BOUNCE_HIGH) {
- pr_warn("no bounce buffer support for integrity metadata\n");
- return -EINVAL;
- }
-
if (!IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY)) {
pr_warn("integrity support disabled.\n");
return -EINVAL;
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 1f9b45b0b9ee..b2b9b89d6967 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -134,6 +134,8 @@ QUEUE_SYSFS_LIMIT_SHOW(max_segments)
QUEUE_SYSFS_LIMIT_SHOW(max_discard_segments)
QUEUE_SYSFS_LIMIT_SHOW(max_integrity_segments)
QUEUE_SYSFS_LIMIT_SHOW(max_segment_size)
+QUEUE_SYSFS_LIMIT_SHOW(max_write_streams)
+QUEUE_SYSFS_LIMIT_SHOW(write_stream_granularity)
QUEUE_SYSFS_LIMIT_SHOW(logical_block_size)
QUEUE_SYSFS_LIMIT_SHOW(physical_block_size)
QUEUE_SYSFS_LIMIT_SHOW(chunk_sectors)
@@ -488,6 +490,8 @@ QUEUE_LIM_RO_ENTRY(queue_max_hw_sectors, "max_hw_sectors_kb");
QUEUE_LIM_RO_ENTRY(queue_max_segments, "max_segments");
QUEUE_LIM_RO_ENTRY(queue_max_integrity_segments, "max_integrity_segments");
QUEUE_LIM_RO_ENTRY(queue_max_segment_size, "max_segment_size");
+QUEUE_LIM_RO_ENTRY(queue_max_write_streams, "max_write_streams");
+QUEUE_LIM_RO_ENTRY(queue_write_stream_granularity, "write_stream_granularity");
QUEUE_RW_ENTRY(elv_iosched, "scheduler");
QUEUE_LIM_RO_ENTRY(queue_logical_block_size, "logical_block_size");
@@ -560,7 +564,7 @@ static ssize_t queue_wb_lat_show(struct gendisk *disk, char *page)
ssize_t ret;
struct request_queue *q = disk->queue;
- mutex_lock(&q->elevator_lock);
+ mutex_lock(&disk->rqos_state_mutex);
if (!wbt_rq_qos(q)) {
ret = -EINVAL;
goto out;
@@ -573,7 +577,7 @@ static ssize_t queue_wb_lat_show(struct gendisk *disk, char *page)
ret = sysfs_emit(page, "%llu\n", div_u64(wbt_get_min_lat(q), 1000));
out:
- mutex_unlock(&q->elevator_lock);
+ mutex_unlock(&disk->rqos_state_mutex);
return ret;
}
@@ -593,7 +597,6 @@ static ssize_t queue_wb_lat_store(struct gendisk *disk, const char *page,
return -EINVAL;
memflags = blk_mq_freeze_queue(q);
- mutex_lock(&q->elevator_lock);
rqos = wbt_rq_qos(q);
if (!rqos) {
@@ -618,11 +621,12 @@ static ssize_t queue_wb_lat_store(struct gendisk *disk, const char *page,
*/
blk_mq_quiesce_queue(q);
+ mutex_lock(&disk->rqos_state_mutex);
wbt_set_min_lat(q, val);
+ mutex_unlock(&disk->rqos_state_mutex);
blk_mq_unquiesce_queue(q);
out:
- mutex_unlock(&q->elevator_lock);
blk_mq_unfreeze_queue(q, memflags);
return ret;
@@ -642,6 +646,8 @@ static struct attribute *queue_attrs[] = {
&queue_max_discard_segments_entry.attr,
&queue_max_integrity_segments_entry.attr,
&queue_max_segment_size_entry.attr,
+ &queue_max_write_streams_entry.attr,
+ &queue_write_stream_granularity_entry.attr,
&queue_hw_sector_size_entry.attr,
&queue_logical_block_size_entry.attr,
&queue_physical_block_size_entry.attr,
@@ -869,16 +875,9 @@ int blk_register_queue(struct gendisk *disk)
if (ret)
goto out_unregister_ia_ranges;
- mutex_lock(&q->elevator_lock);
- if (q->elevator) {
- ret = elv_register_queue(q, false);
- if (ret) {
- mutex_unlock(&q->elevator_lock);
- goto out_crypto_sysfs_unregister;
- }
- }
+ if (queue_is_mq(q))
+ elevator_set_default(q);
wbt_enable_default(disk);
- mutex_unlock(&q->elevator_lock);
blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q);
@@ -902,8 +901,6 @@ int blk_register_queue(struct gendisk *disk)
return ret;
-out_crypto_sysfs_unregister:
- blk_crypto_sysfs_unregister(disk);
out_unregister_ia_ranges:
disk_unregister_independent_access_ranges(disk);
out_debugfs_remove:
@@ -951,10 +948,6 @@ void blk_unregister_queue(struct gendisk *disk)
blk_mq_sysfs_unregister(disk);
blk_crypto_sysfs_unregister(disk);
- mutex_lock(&q->elevator_lock);
- elv_unregister_queue(q);
- mutex_unlock(&q->elevator_lock);
-
mutex_lock(&q->sysfs_lock);
disk_unregister_independent_access_ranges(disk);
mutex_unlock(&q->sysfs_lock);
@@ -963,5 +956,8 @@ void blk_unregister_queue(struct gendisk *disk)
kobject_uevent(&disk->queue_kobj, KOBJ_REMOVE);
kobject_del(&disk->queue_kobj);
+ if (queue_is_mq(q))
+ elevator_set_none(q);
+
blk_debugfs_remove(disk);
}
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index d6dd2e047874..bd15357f23bd 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -143,7 +143,8 @@ static inline unsigned int throtl_bio_data_size(struct bio *bio)
static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg)
{
INIT_LIST_HEAD(&qn->node);
- bio_list_init(&qn->bios);
+ bio_list_init(&qn->bios_bps);
+ bio_list_init(&qn->bios_iops);
qn->tg = tg;
}
@@ -151,18 +152,32 @@ static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg)
* throtl_qnode_add_bio - add a bio to a throtl_qnode and activate it
* @bio: bio being added
* @qn: qnode to add bio to
- * @queued: the service_queue->queued[] list @qn belongs to
+ * @sq: the service_queue @qn belongs to
*
- * Add @bio to @qn and put @qn on @queued if it's not already on.
+ * Add @bio to @qn and put @qn on @sq->queued if it's not already on.
* @qn->tg's reference count is bumped when @qn is activated. See the
* comment on top of throtl_qnode definition for details.
*/
static void throtl_qnode_add_bio(struct bio *bio, struct throtl_qnode *qn,
- struct list_head *queued)
+ struct throtl_service_queue *sq)
{
- bio_list_add(&qn->bios, bio);
+ bool rw = bio_data_dir(bio);
+
+ /*
+ * Split bios have already been throttled by bps, so they are
+ * directly queued into the iops path.
+ */
+ if (bio_flagged(bio, BIO_TG_BPS_THROTTLED) ||
+ bio_flagged(bio, BIO_BPS_THROTTLED)) {
+ bio_list_add(&qn->bios_iops, bio);
+ sq->nr_queued_iops[rw]++;
+ } else {
+ bio_list_add(&qn->bios_bps, bio);
+ sq->nr_queued_bps[rw]++;
+ }
+
if (list_empty(&qn->node)) {
- list_add_tail(&qn->node, queued);
+ list_add_tail(&qn->node, &sq->queued[rw]);
blkg_get(tg_to_blkg(qn->tg));
}
}
@@ -170,6 +185,10 @@ static void throtl_qnode_add_bio(struct bio *bio, struct throtl_qnode *qn,
/**
* throtl_peek_queued - peek the first bio on a qnode list
* @queued: the qnode list to peek
+ *
+ * Always take a bio from the head of the iops queue first. If the queue is
+ * empty, we then take it from the bps queue to maintain the overall idea of
+ * fetching bios from the head.
*/
static struct bio *throtl_peek_queued(struct list_head *queued)
{
@@ -180,28 +199,33 @@ static struct bio *throtl_peek_queued(struct list_head *queued)
return NULL;
qn = list_first_entry(queued, struct throtl_qnode, node);
- bio = bio_list_peek(&qn->bios);
+ bio = bio_list_peek(&qn->bios_iops);
+ if (!bio)
+ bio = bio_list_peek(&qn->bios_bps);
WARN_ON_ONCE(!bio);
return bio;
}
/**
* throtl_pop_queued - pop the first bio form a qnode list
- * @queued: the qnode list to pop a bio from
+ * @sq: the service_queue to pop a bio from
* @tg_to_put: optional out argument for throtl_grp to put
+ * @rw: read/write
*
- * Pop the first bio from the qnode list @queued. After popping, the first
- * qnode is removed from @queued if empty or moved to the end of @queued so
- * that the popping order is round-robin.
+ * Pop the first bio from the qnode list @sq->queued. Note that we firstly
+ * focus on the iops list because bios are ultimately dispatched from it.
+ * After popping, the first qnode is removed from @sq->queued if empty or moved
+ * to the end of @sq->queued so that the popping order is round-robin.
*
* When the first qnode is removed, its associated throtl_grp should be put
* too. If @tg_to_put is NULL, this function automatically puts it;
* otherwise, *@tg_to_put is set to the throtl_grp to put and the caller is
* responsible for putting it.
*/
-static struct bio *throtl_pop_queued(struct list_head *queued,
- struct throtl_grp **tg_to_put)
+static struct bio *throtl_pop_queued(struct throtl_service_queue *sq,
+ struct throtl_grp **tg_to_put, bool rw)
{
+ struct list_head *queued = &sq->queued[rw];
struct throtl_qnode *qn;
struct bio *bio;
@@ -209,10 +233,17 @@ static struct bio *throtl_pop_queued(struct list_head *queued,
return NULL;
qn = list_first_entry(queued, struct throtl_qnode, node);
- bio = bio_list_pop(&qn->bios);
+ bio = bio_list_pop(&qn->bios_iops);
+ if (bio) {
+ sq->nr_queued_iops[rw]--;
+ } else {
+ bio = bio_list_pop(&qn->bios_bps);
+ if (bio)
+ sq->nr_queued_bps[rw]--;
+ }
WARN_ON_ONCE(!bio);
- if (bio_list_empty(&qn->bios)) {
+ if (bio_list_empty(&qn->bios_bps) && bio_list_empty(&qn->bios_iops)) {
list_del_init(&qn->node);
if (tg_to_put)
*tg_to_put = qn->tg;
@@ -520,6 +551,9 @@ static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw,
static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw,
unsigned long jiffy_end)
{
+ if (!time_before(tg->slice_end[rw], jiffy_end))
+ return;
+
throtl_set_slice_end(tg, rw, jiffy_end);
throtl_log(&tg->service_queue,
"[%c] extend slice start=%lu end=%lu jiffies=%lu",
@@ -536,6 +570,11 @@ static bool throtl_slice_used(struct throtl_grp *tg, bool rw)
return true;
}
+static unsigned int sq_queued(struct throtl_service_queue *sq, int type)
+{
+ return sq->nr_queued_bps[type] + sq->nr_queued_iops[type];
+}
+
static unsigned int calculate_io_allowed(u32 iops_limit,
unsigned long jiffy_elapsed)
{
@@ -571,6 +610,48 @@ static u64 calculate_bytes_allowed(u64 bps_limit, unsigned long jiffy_elapsed)
return mul_u64_u64_div_u64(bps_limit, (u64)jiffy_elapsed, (u64)HZ);
}
+static long long throtl_trim_bps(struct throtl_grp *tg, bool rw,
+ unsigned long time_elapsed)
+{
+ u64 bps_limit = tg_bps_limit(tg, rw);
+ long long bytes_trim;
+
+ if (bps_limit == U64_MAX)
+ return 0;
+
+ /* Need to consider the case of bytes_allowed overflow. */
+ bytes_trim = calculate_bytes_allowed(bps_limit, time_elapsed);
+ if (bytes_trim <= 0 || tg->bytes_disp[rw] < bytes_trim) {
+ bytes_trim = tg->bytes_disp[rw];
+ tg->bytes_disp[rw] = 0;
+ } else {
+ tg->bytes_disp[rw] -= bytes_trim;
+ }
+
+ return bytes_trim;
+}
+
+static int throtl_trim_iops(struct throtl_grp *tg, bool rw,
+ unsigned long time_elapsed)
+{
+ u32 iops_limit = tg_iops_limit(tg, rw);
+ int io_trim;
+
+ if (iops_limit == UINT_MAX)
+ return 0;
+
+ /* Need to consider the case of io_allowed overflow. */
+ io_trim = calculate_io_allowed(iops_limit, time_elapsed);
+ if (io_trim <= 0 || tg->io_disp[rw] < io_trim) {
+ io_trim = tg->io_disp[rw];
+ tg->io_disp[rw] = 0;
+ } else {
+ tg->io_disp[rw] -= io_trim;
+ }
+
+ return io_trim;
+}
+
/* Trim the used slices and adjust slice start accordingly */
static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw)
{
@@ -612,22 +693,11 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw)
* one extra slice is preserved for deviation.
*/
time_elapsed -= tg->td->throtl_slice;
- bytes_trim = calculate_bytes_allowed(tg_bps_limit(tg, rw),
- time_elapsed);
- io_trim = calculate_io_allowed(tg_iops_limit(tg, rw), time_elapsed);
- if (bytes_trim <= 0 && io_trim <= 0)
+ bytes_trim = throtl_trim_bps(tg, rw, time_elapsed);
+ io_trim = throtl_trim_iops(tg, rw, time_elapsed);
+ if (!bytes_trim && !io_trim)
return;
- if ((long long)tg->bytes_disp[rw] >= bytes_trim)
- tg->bytes_disp[rw] -= bytes_trim;
- else
- tg->bytes_disp[rw] = 0;
-
- if ((int)tg->io_disp[rw] >= io_trim)
- tg->io_disp[rw] -= io_trim;
- else
- tg->io_disp[rw] = 0;
-
tg->slice_start[rw] += time_elapsed;
throtl_log(&tg->service_queue,
@@ -643,21 +713,41 @@ static void __tg_update_carryover(struct throtl_grp *tg, bool rw,
unsigned long jiffy_elapsed = jiffies - tg->slice_start[rw];
u64 bps_limit = tg_bps_limit(tg, rw);
u32 iops_limit = tg_iops_limit(tg, rw);
+ long long bytes_allowed;
+ int io_allowed;
+
+ /*
+ * If the queue is empty, carryover handling is not needed. In such cases,
+ * tg->[bytes/io]_disp should be reset to 0 to avoid impacting the dispatch
+ * of subsequent bios. The same handling applies when the previous BPS/IOPS
+ * limit was set to max.
+ */
+ if (sq_queued(&tg->service_queue, rw) == 0) {
+ tg->bytes_disp[rw] = 0;
+ tg->io_disp[rw] = 0;
+ return;
+ }
/*
* If config is updated while bios are still throttled, calculate and
- * accumulate how many bytes/ios are waited across changes. And
- * carryover_bytes/ios will be used to calculate new wait time under new
- * configuration.
+ * accumulate how many bytes/ios are waited across changes. And use the
+ * calculated carryover (@bytes/@ios) to update [bytes/io]_disp, which
+ * will be used to calculate new wait time under new configuration.
+ * And we need to consider the case of bytes/io_allowed overflow.
*/
- if (bps_limit != U64_MAX)
- *bytes = calculate_bytes_allowed(bps_limit, jiffy_elapsed) -
- tg->bytes_disp[rw];
- if (iops_limit != UINT_MAX)
- *ios = calculate_io_allowed(iops_limit, jiffy_elapsed) -
- tg->io_disp[rw];
- tg->bytes_disp[rw] -= *bytes;
- tg->io_disp[rw] -= *ios;
+ if (bps_limit != U64_MAX) {
+ bytes_allowed = calculate_bytes_allowed(bps_limit, jiffy_elapsed);
+ if (bytes_allowed > 0)
+ *bytes = bytes_allowed - tg->bytes_disp[rw];
+ }
+ if (iops_limit != UINT_MAX) {
+ io_allowed = calculate_io_allowed(iops_limit, jiffy_elapsed);
+ if (io_allowed > 0)
+ *ios = io_allowed - tg->io_disp[rw];
+ }
+
+ tg->bytes_disp[rw] = -*bytes;
+ tg->io_disp[rw] = -*ios;
}
static void tg_update_carryover(struct throtl_grp *tg)
@@ -665,12 +755,10 @@ static void tg_update_carryover(struct throtl_grp *tg)
long long bytes[2] = {0};
int ios[2] = {0};
- if (tg->service_queue.nr_queued[READ])
- __tg_update_carryover(tg, READ, &bytes[READ], &ios[READ]);
- if (tg->service_queue.nr_queued[WRITE])
- __tg_update_carryover(tg, WRITE, &bytes[WRITE], &ios[WRITE]);
+ __tg_update_carryover(tg, READ, &bytes[READ], &ios[READ]);
+ __tg_update_carryover(tg, WRITE, &bytes[WRITE], &ios[WRITE]);
- /* see comments in struct throtl_grp for meaning of these fields. */
+ /* see comments in struct throtl_grp for meaning of carryover. */
throtl_log(&tg->service_queue, "%s: %lld %lld %d %d\n", __func__,
bytes[READ], bytes[WRITE], ios[READ], ios[WRITE]);
}
@@ -682,10 +770,6 @@ static unsigned long tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio
int io_allowed;
unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
- if (iops_limit == UINT_MAX) {
- return 0;
- }
-
jiffy_elapsed = jiffies - tg->slice_start[rw];
/* Round up to the next throttle slice, wait time must be nonzero */
@@ -711,11 +795,6 @@ static unsigned long tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio,
unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
unsigned int bio_size = throtl_bio_data_size(bio);
- /* no need to throttle if this bio's bytes have been accounted */
- if (bps_limit == U64_MAX || bio_flagged(bio, BIO_BPS_THROTTLED)) {
- return 0;
- }
-
jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
/* Slice has just started. Consider one slice interval */
@@ -724,7 +803,9 @@ static unsigned long tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio,
jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice);
bytes_allowed = calculate_bytes_allowed(bps_limit, jiffy_elapsed_rnd);
- if (bytes_allowed > 0 && tg->bytes_disp[rw] + bio_size <= bytes_allowed)
+ /* Need to consider the case of bytes_allowed overflow. */
+ if ((bytes_allowed > 0 && tg->bytes_disp[rw] + bio_size <= bytes_allowed)
+ || bytes_allowed < 0)
return 0;
/* Calc approx time to dispatch */
@@ -742,17 +823,82 @@ static unsigned long tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio,
return jiffy_wait;
}
+static void throtl_charge_bps_bio(struct throtl_grp *tg, struct bio *bio)
+{
+ unsigned int bio_size = throtl_bio_data_size(bio);
+
+ /* Charge the bio to the group */
+ if (!bio_flagged(bio, BIO_BPS_THROTTLED) &&
+ !bio_flagged(bio, BIO_TG_BPS_THROTTLED)) {
+ bio_set_flag(bio, BIO_TG_BPS_THROTTLED);
+ tg->bytes_disp[bio_data_dir(bio)] += bio_size;
+ }
+}
+
+static void throtl_charge_iops_bio(struct throtl_grp *tg, struct bio *bio)
+{
+ bio_clear_flag(bio, BIO_TG_BPS_THROTTLED);
+ tg->io_disp[bio_data_dir(bio)]++;
+}
+
/*
- * Returns whether one can dispatch a bio or not. Also returns approx number
- * of jiffies to wait before this bio is with-in IO rate and can be dispatched
+ * If previous slice expired, start a new one otherwise renew/extend existing
+ * slice to make sure it is at least throtl_slice interval long since now. New
+ * slice is started only for empty throttle group. If there is queued bio, that
+ * means there should be an active slice and it should be extended instead.
*/
-static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
- unsigned long *wait)
+static void tg_update_slice(struct throtl_grp *tg, bool rw)
+{
+ if (throtl_slice_used(tg, rw) &&
+ sq_queued(&tg->service_queue, rw) == 0)
+ throtl_start_new_slice(tg, rw, true);
+ else
+ throtl_extend_slice(tg, rw, jiffies + tg->td->throtl_slice);
+}
+
+static unsigned long tg_dispatch_bps_time(struct throtl_grp *tg, struct bio *bio)
{
bool rw = bio_data_dir(bio);
- unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0;
u64 bps_limit = tg_bps_limit(tg, rw);
+ unsigned long bps_wait;
+
+ /* no need to throttle if this bio's bytes have been accounted */
+ if (bps_limit == U64_MAX || tg->flags & THROTL_TG_CANCELING ||
+ bio_flagged(bio, BIO_BPS_THROTTLED) ||
+ bio_flagged(bio, BIO_TG_BPS_THROTTLED))
+ return 0;
+
+ tg_update_slice(tg, rw);
+ bps_wait = tg_within_bps_limit(tg, bio, bps_limit);
+ throtl_extend_slice(tg, rw, jiffies + bps_wait);
+
+ return bps_wait;
+}
+
+static unsigned long tg_dispatch_iops_time(struct throtl_grp *tg, struct bio *bio)
+{
+ bool rw = bio_data_dir(bio);
u32 iops_limit = tg_iops_limit(tg, rw);
+ unsigned long iops_wait;
+
+ if (iops_limit == UINT_MAX || tg->flags & THROTL_TG_CANCELING)
+ return 0;
+
+ tg_update_slice(tg, rw);
+ iops_wait = tg_within_iops_limit(tg, bio, iops_limit);
+ throtl_extend_slice(tg, rw, jiffies + iops_wait);
+
+ return iops_wait;
+}
+
+/*
+ * Returns approx number of jiffies to wait before this bio is with-in IO rate
+ * and can be moved to other queue or dispatched.
+ */
+static unsigned long tg_dispatch_time(struct throtl_grp *tg, struct bio *bio)
+{
+ bool rw = bio_data_dir(bio);
+ unsigned long wait;
/*
* Currently whole state machine of group depends on first bio
@@ -760,62 +906,20 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
* this function with a different bio if there are other bios
* queued.
*/
- BUG_ON(tg->service_queue.nr_queued[rw] &&
+ BUG_ON(sq_queued(&tg->service_queue, rw) &&
bio != throtl_peek_queued(&tg->service_queue.queued[rw]));
- /* If tg->bps = -1, then BW is unlimited */
- if ((bps_limit == U64_MAX && iops_limit == UINT_MAX) ||
- tg->flags & THROTL_TG_CANCELING) {
- if (wait)
- *wait = 0;
- return true;
- }
+ wait = tg_dispatch_bps_time(tg, bio);
+ if (wait != 0)
+ return wait;
/*
- * If previous slice expired, start a new one otherwise renew/extend
- * existing slice to make sure it is at least throtl_slice interval
- * long since now. New slice is started only for empty throttle group.
- * If there is queued bio, that means there should be an active
- * slice and it should be extended instead.
+ * Charge bps here because @bio will be directly placed into the
+ * iops queue afterward.
*/
- if (throtl_slice_used(tg, rw) && !(tg->service_queue.nr_queued[rw]))
- throtl_start_new_slice(tg, rw, true);
- else {
- if (time_before(tg->slice_end[rw],
- jiffies + tg->td->throtl_slice))
- throtl_extend_slice(tg, rw,
- jiffies + tg->td->throtl_slice);
- }
-
- bps_wait = tg_within_bps_limit(tg, bio, bps_limit);
- iops_wait = tg_within_iops_limit(tg, bio, iops_limit);
- if (bps_wait + iops_wait == 0) {
- if (wait)
- *wait = 0;
- return true;
- }
-
- max_wait = max(bps_wait, iops_wait);
-
- if (wait)
- *wait = max_wait;
-
- if (time_before(tg->slice_end[rw], jiffies + max_wait))
- throtl_extend_slice(tg, rw, jiffies + max_wait);
-
- return false;
-}
-
-static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
-{
- bool rw = bio_data_dir(bio);
- unsigned int bio_size = throtl_bio_data_size(bio);
+ throtl_charge_bps_bio(tg, bio);
- /* Charge the bio to the group */
- if (!bio_flagged(bio, BIO_BPS_THROTTLED))
- tg->bytes_disp[rw] += bio_size;
-
- tg->io_disp[rw]++;
+ return tg_dispatch_iops_time(tg, bio);
}
/**
@@ -842,28 +946,36 @@ static void throtl_add_bio_tg(struct bio *bio, struct throtl_qnode *qn,
* dispatched. Mark that @tg was empty. This is automatically
* cleared on the next tg_update_disptime().
*/
- if (!sq->nr_queued[rw])
+ if (sq_queued(sq, rw) == 0)
tg->flags |= THROTL_TG_WAS_EMPTY;
- throtl_qnode_add_bio(bio, qn, &sq->queued[rw]);
+ throtl_qnode_add_bio(bio, qn, sq);
+
+ /*
+ * Since we have split the queues, when the iops queue is
+ * previously empty and a new @bio is added into the first @qn,
+ * we also need to update the @tg->disptime.
+ */
+ if (bio_flagged(bio, BIO_BPS_THROTTLED) &&
+ bio == throtl_peek_queued(&sq->queued[rw]))
+ tg->flags |= THROTL_TG_IOPS_WAS_EMPTY;
- sq->nr_queued[rw]++;
throtl_enqueue_tg(tg);
}
static void tg_update_disptime(struct throtl_grp *tg)
{
struct throtl_service_queue *sq = &tg->service_queue;
- unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime;
+ unsigned long read_wait = -1, write_wait = -1, min_wait, disptime;
struct bio *bio;
bio = throtl_peek_queued(&sq->queued[READ]);
if (bio)
- tg_may_dispatch(tg, bio, &read_wait);
+ read_wait = tg_dispatch_time(tg, bio);
bio = throtl_peek_queued(&sq->queued[WRITE]);
if (bio)
- tg_may_dispatch(tg, bio, &write_wait);
+ write_wait = tg_dispatch_time(tg, bio);
min_wait = min(read_wait, write_wait);
disptime = jiffies + min_wait;
@@ -875,6 +987,7 @@ static void tg_update_disptime(struct throtl_grp *tg)
/* see throtl_add_bio_tg() */
tg->flags &= ~THROTL_TG_WAS_EMPTY;
+ tg->flags &= ~THROTL_TG_IOPS_WAS_EMPTY;
}
static void start_parent_slice_with_credit(struct throtl_grp *child_tg,
@@ -901,10 +1014,9 @@ static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw)
* getting released prematurely. Remember the tg to put and put it
* after @bio is transferred to @parent_sq.
*/
- bio = throtl_pop_queued(&sq->queued[rw], &tg_to_put);
- sq->nr_queued[rw]--;
+ bio = throtl_pop_queued(sq, &tg_to_put, rw);
- throtl_charge_bio(tg, bio);
+ throtl_charge_iops_bio(tg, bio);
/*
* If our parent is another tg, we just need to transfer @bio to
@@ -919,7 +1031,7 @@ static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw)
} else {
bio_set_flag(bio, BIO_BPS_THROTTLED);
throtl_qnode_add_bio(bio, &tg->qnode_on_parent[rw],
- &parent_sq->queued[rw]);
+ parent_sq);
BUG_ON(tg->td->nr_queued[rw] <= 0);
tg->td->nr_queued[rw]--;
}
@@ -941,7 +1053,7 @@ static int throtl_dispatch_tg(struct throtl_grp *tg)
/* Try to dispatch 75% READS and 25% WRITES */
while ((bio = throtl_peek_queued(&sq->queued[READ])) &&
- tg_may_dispatch(tg, bio, NULL)) {
+ tg_dispatch_time(tg, bio) == 0) {
tg_dispatch_one_bio(tg, READ);
nr_reads++;
@@ -951,7 +1063,7 @@ static int throtl_dispatch_tg(struct throtl_grp *tg)
}
while ((bio = throtl_peek_queued(&sq->queued[WRITE])) &&
- tg_may_dispatch(tg, bio, NULL)) {
+ tg_dispatch_time(tg, bio) == 0) {
tg_dispatch_one_bio(tg, WRITE);
nr_writes++;
@@ -984,7 +1096,7 @@ static int throtl_select_dispatch(struct throtl_service_queue *parent_sq)
nr_disp += throtl_dispatch_tg(tg);
sq = &tg->service_queue;
- if (sq->nr_queued[READ] || sq->nr_queued[WRITE])
+ if (sq_queued(sq, READ) || sq_queued(sq, WRITE))
tg_update_disptime(tg);
else
throtl_dequeue_tg(tg);
@@ -1037,9 +1149,11 @@ again:
dispatched = false;
while (true) {
+ unsigned int __maybe_unused bio_cnt_r = sq_queued(sq, READ);
+ unsigned int __maybe_unused bio_cnt_w = sq_queued(sq, WRITE);
+
throtl_log(sq, "dispatch nr_queued=%u read=%u write=%u",
- sq->nr_queued[READ] + sq->nr_queued[WRITE],
- sq->nr_queued[READ], sq->nr_queued[WRITE]);
+ bio_cnt_r + bio_cnt_w, bio_cnt_r, bio_cnt_w);
ret = throtl_select_dispatch(sq);
if (ret) {
@@ -1061,7 +1175,8 @@ again:
if (parent_sq) {
/* @parent_sq is another throl_grp, propagate dispatch */
- if (tg->flags & THROTL_TG_WAS_EMPTY) {
+ if (tg->flags & THROTL_TG_WAS_EMPTY ||
+ tg->flags & THROTL_TG_IOPS_WAS_EMPTY) {
tg_update_disptime(tg);
if (!throtl_schedule_next_dispatch(parent_sq, false)) {
/* window is already open, repeat dispatching */
@@ -1101,7 +1216,7 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work)
spin_lock_irq(&q->queue_lock);
for (rw = READ; rw <= WRITE; rw++)
- while ((bio = throtl_pop_queued(&td_sq->queued[rw], NULL)))
+ while ((bio = throtl_pop_queued(td_sq, NULL, rw)))
bio_list_add(&bio_list_on_stack, bio);
spin_unlock_irq(&q->queue_lock);
@@ -1606,11 +1721,30 @@ void blk_throtl_cancel_bios(struct gendisk *disk)
static bool tg_within_limit(struct throtl_grp *tg, struct bio *bio, bool rw)
{
- /* throtl is FIFO - if bios are already queued, should queue */
- if (tg->service_queue.nr_queued[rw])
+ struct throtl_service_queue *sq = &tg->service_queue;
+
+ /*
+ * For a split bio, we need to specifically distinguish whether the
+ * iops queue is empty.
+ */
+ if (bio_flagged(bio, BIO_BPS_THROTTLED))
+ return sq->nr_queued_iops[rw] == 0 &&
+ tg_dispatch_iops_time(tg, bio) == 0;
+
+ /*
+ * Throtl is FIFO - if bios are already queued, should queue.
+ * If the bps queue is empty and @bio is within the bps limit, charge
+ * bps here for direct placement into the iops queue.
+ */
+ if (sq_queued(&tg->service_queue, rw)) {
+ if (sq->nr_queued_bps[rw] == 0 &&
+ tg_dispatch_bps_time(tg, bio) == 0)
+ throtl_charge_bps_bio(tg, bio);
+
return false;
+ }
- return tg_may_dispatch(tg, bio, NULL);
+ return tg_dispatch_time(tg, bio) == 0;
}
bool __blk_throtl_bio(struct bio *bio)
@@ -1631,7 +1765,7 @@ bool __blk_throtl_bio(struct bio *bio)
while (true) {
if (tg_within_limit(tg, bio, rw)) {
/* within limits, let's charge and dispatch directly */
- throtl_charge_bio(tg, bio);
+ throtl_charge_iops_bio(tg, bio);
/*
* We need to trim slice even when bios are not being
@@ -1654,7 +1788,8 @@ bool __blk_throtl_bio(struct bio *bio)
* control algorithm is adaptive, and extra IO bytes
* will be throttled for paying the debt
*/
- throtl_charge_bio(tg, bio);
+ throtl_charge_bps_bio(tg, bio);
+ throtl_charge_iops_bio(tg, bio);
} else {
/* if above limits, break to queue */
break;
@@ -1680,7 +1815,7 @@ bool __blk_throtl_bio(struct bio *bio)
tg->bytes_disp[rw], bio->bi_iter.bi_size,
tg_bps_limit(tg, rw),
tg->io_disp[rw], tg_iops_limit(tg, rw),
- sq->nr_queued[READ], sq->nr_queued[WRITE]);
+ sq_queued(sq, READ), sq_queued(sq, WRITE));
td->nr_queued[rw]++;
throtl_add_bio_tg(bio, qn, tg);
@@ -1688,11 +1823,13 @@ bool __blk_throtl_bio(struct bio *bio)
/*
* Update @tg's dispatch time and force schedule dispatch if @tg
- * was empty before @bio. The forced scheduling isn't likely to
- * cause undue delay as @bio is likely to be dispatched directly if
- * its @tg's disptime is not in the future.
+ * was empty before @bio, or the iops queue is empty and @bio will
+ * add to. The forced scheduling isn't likely to cause undue
+ * delay as @bio is likely to be dispatched directly if its @tg's
+ * disptime is not in the future.
*/
- if (tg->flags & THROTL_TG_WAS_EMPTY) {
+ if (tg->flags & THROTL_TG_WAS_EMPTY ||
+ tg->flags & THROTL_TG_IOPS_WAS_EMPTY) {
tg_update_disptime(tg);
throtl_schedule_next_dispatch(tg->service_queue.parent_sq, true);
}
diff --git a/block/blk-throttle.h b/block/blk-throttle.h
index f9f8666891ab..3b27755bfbff 100644
--- a/block/blk-throttle.h
+++ b/block/blk-throttle.h
@@ -29,7 +29,8 @@
*/
struct throtl_qnode {
struct list_head node; /* service_queue->queued[] */
- struct bio_list bios; /* queued bios */
+ struct bio_list bios_bps; /* queued bios for bps limit */
+ struct bio_list bios_iops; /* queued bios for iops limit */
struct throtl_grp *tg; /* tg this qnode belongs to */
};
@@ -41,7 +42,8 @@ struct throtl_service_queue {
* children throtl_grp's.
*/
struct list_head queued[2]; /* throtl_qnode [READ/WRITE] */
- unsigned int nr_queued[2]; /* number of queued bios */
+ unsigned int nr_queued_bps[2]; /* number of queued bps bios */
+ unsigned int nr_queued_iops[2]; /* number of queued iops bios */
/*
* RB tree of active children throtl_grp's, which are sorted by
@@ -54,9 +56,14 @@ struct throtl_service_queue {
};
enum tg_state_flags {
- THROTL_TG_PENDING = 1 << 0, /* on parent's pending tree */
- THROTL_TG_WAS_EMPTY = 1 << 1, /* bio_lists[] became non-empty */
- THROTL_TG_CANCELING = 1 << 2, /* starts to cancel bio */
+ THROTL_TG_PENDING = 1 << 0, /* on parent's pending tree */
+ THROTL_TG_WAS_EMPTY = 1 << 1, /* bio_lists[] became non-empty */
+ /*
+ * The sq's iops queue is empty, and a bio is about to be enqueued
+ * to the first qnode's bios_iops list.
+ */
+ THROTL_TG_IOPS_WAS_EMPTY = 1 << 2,
+ THROTL_TG_CANCELING = 1 << 3, /* starts to cancel bio */
};
struct throtl_grp {
@@ -102,19 +109,16 @@ struct throtl_grp {
/* IOPS limits */
unsigned int iops[2];
- /* Number of bytes dispatched in current slice */
- int64_t bytes_disp[2];
- /* Number of bio's dispatched in current slice */
- int io_disp[2];
-
/*
- * The following two fields are updated when new configuration is
- * submitted while some bios are still throttled, they record how many
- * bytes/ios are waited already in previous configuration, and they will
- * be used to calculate wait time under new configuration.
+ * Number of bytes/bio's dispatched in current slice.
+ * When new configuration is submitted while some bios are still throttled,
+ * first calculate the carryover: the amount of bytes/IOs already waited
+ * under the previous configuration. Then, [bytes/io]_disp are represented
+ * as the negative of the carryover, and they will be used to calculate the
+ * wait time under the new configuration.
*/
- long long carryover_bytes[2];
- int carryover_ios[2];
+ int64_t bytes_disp[2];
+ int io_disp[2];
unsigned long last_check_time;
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index f1754d07f7e0..a50d4cd55f41 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -37,7 +37,7 @@
enum wbt_flags {
WBT_TRACKED = 1, /* write, tracked for throttling */
WBT_READ = 2, /* read */
- WBT_SWAP = 4, /* write, from swap_writepage() */
+ WBT_SWAP = 4, /* write, from swap_writeout() */
WBT_DISCARD = 8, /* discard */
WBT_NR_BITS = 4, /* number of bits */
@@ -704,8 +704,9 @@ void wbt_enable_default(struct gendisk *disk)
struct rq_qos *rqos;
bool enable = IS_ENABLED(CONFIG_BLK_WBT_MQ);
- if (q->elevator &&
- test_bit(ELEVATOR_FLAG_DISABLE_WBT, &q->elevator->flags))
+ mutex_lock(&disk->rqos_state_mutex);
+
+ if (blk_queue_disable_wbt(q))
enable = false;
/* Throttling already enabled? */
@@ -713,8 +714,10 @@ void wbt_enable_default(struct gendisk *disk)
if (rqos) {
if (enable && RQWB(rqos)->enable_state == WBT_STATE_OFF_DEFAULT)
RQWB(rqos)->enable_state = WBT_STATE_ON_DEFAULT;
+ mutex_unlock(&disk->rqos_state_mutex);
return;
}
+ mutex_unlock(&disk->rqos_state_mutex);
/* Queue not registered? Maybe shutting down... */
if (!blk_queue_registered(q))
@@ -774,11 +777,13 @@ void wbt_disable_default(struct gendisk *disk)
struct rq_wb *rwb;
if (!rqos)
return;
+ mutex_lock(&disk->rqos_state_mutex);
rwb = RQWB(rqos);
if (rwb->enable_state == WBT_STATE_ON_DEFAULT) {
blk_stat_deactivate(rwb->cb);
rwb->enable_state = WBT_STATE_OFF_DEFAULT;
}
+ mutex_unlock(&disk->rqos_state_mutex);
}
EXPORT_SYMBOL_GPL(wbt_disable_default);
diff --git a/block/blk.h b/block/blk.h
index 594eeba7b949..37ec459fe656 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -103,8 +103,7 @@ struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs,
void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs);
bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv,
- struct page *page, unsigned len, unsigned offset,
- bool *same_page);
+ struct page *page, unsigned len, unsigned offset);
static inline bool biovec_phys_mergeable(struct request_queue *q,
struct bio_vec *vec1, struct bio_vec *vec2)
@@ -322,11 +321,9 @@ bool blk_bio_list_merge(struct request_queue *q, struct list_head *list,
bool blk_insert_flush(struct request *rq);
-int elevator_switch(struct request_queue *q, struct elevator_type *new_e);
-void elevator_disable(struct request_queue *q);
-void elevator_exit(struct request_queue *q);
-int elv_register_queue(struct request_queue *q, bool uevent);
-void elv_unregister_queue(struct request_queue *q);
+void elv_update_nr_hw_queues(struct request_queue *q);
+void elevator_set_default(struct request_queue *q);
+void elevator_set_none(struct request_queue *q);
ssize_t part_size_show(struct device *dev, struct device_attribute *attr,
char *buf);
@@ -407,6 +404,27 @@ static inline struct bio *__bio_split_to_limits(struct bio *bio,
}
}
+/**
+ * get_max_segment_size() - maximum number of bytes to add as a single segment
+ * @lim: Request queue limits.
+ * @paddr: address of the range to add
+ * @len: maximum length available to add at @paddr
+ *
+ * Returns the maximum number of bytes of the range starting at @paddr that can
+ * be added to a single segment.
+ */
+static inline unsigned get_max_segment_size(const struct queue_limits *lim,
+ phys_addr_t paddr, unsigned int len)
+{
+ /*
+ * Prevent an overflow if mask = ULONG_MAX and offset = 0 by adding 1
+ * after having calculated the minimum.
+ */
+ return min_t(unsigned long, len,
+ min(lim->seg_boundary_mask - (lim->seg_boundary_mask & paddr),
+ (unsigned long)lim->max_segment_size - 1) + 1);
+}
+
int ll_back_merge_fn(struct request *req, struct bio *bio,
unsigned int nr_segs);
bool blk_attempt_req_merge(struct request_queue *q, struct request *rq,
@@ -421,7 +439,6 @@ void blk_apply_bdi_limits(struct backing_dev_info *bdi,
int blk_dev_init(void);
void update_io_ticks(struct block_device *part, unsigned long now, bool end);
-unsigned int part_in_flight(struct block_device *part);
static inline void req_set_nomerge(struct request_queue *q, struct request *req)
{
@@ -443,23 +460,6 @@ static inline void ioc_clear_queue(struct request_queue *q)
}
#endif /* CONFIG_BLK_ICQ */
-struct bio *__blk_queue_bounce(struct bio *bio, struct request_queue *q);
-
-static inline bool blk_queue_may_bounce(struct request_queue *q)
-{
- return IS_ENABLED(CONFIG_BOUNCE) &&
- (q->limits.features & BLK_FEAT_BOUNCE_HIGH) &&
- max_low_pfn >= max_pfn;
-}
-
-static inline struct bio *blk_queue_bounce(struct bio *bio,
- struct request_queue *q)
-{
- if (unlikely(blk_queue_may_bounce(q) && bio_has_data(bio)))
- return __blk_queue_bounce(bio, q);
- return bio;
-}
-
#ifdef CONFIG_BLK_DEV_ZONED
void disk_init_zone_resources(struct gendisk *disk);
void disk_free_zone_resources(struct gendisk *disk);
diff --git a/block/bounce.c b/block/bounce.c
deleted file mode 100644
index 09a9616cf209..000000000000
--- a/block/bounce.c
+++ /dev/null
@@ -1,267 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/* bounce buffer handling for block devices
- *
- * - Split from highmem.c
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/mm.h>
-#include <linux/export.h>
-#include <linux/swap.h>
-#include <linux/gfp.h>
-#include <linux/bio-integrity.h>
-#include <linux/pagemap.h>
-#include <linux/mempool.h>
-#include <linux/blkdev.h>
-#include <linux/backing-dev.h>
-#include <linux/init.h>
-#include <linux/hash.h>
-#include <linux/highmem.h>
-#include <linux/printk.h>
-#include <asm/tlbflush.h>
-
-#include <trace/events/block.h>
-#include "blk.h"
-#include "blk-cgroup.h"
-
-#define POOL_SIZE 64
-#define ISA_POOL_SIZE 16
-
-static struct bio_set bounce_bio_set, bounce_bio_split;
-static mempool_t page_pool;
-
-static void init_bounce_bioset(void)
-{
- static bool bounce_bs_setup;
- int ret;
-
- if (bounce_bs_setup)
- return;
-
- ret = bioset_init(&bounce_bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
- BUG_ON(ret);
-
- ret = bioset_init(&bounce_bio_split, BIO_POOL_SIZE, 0, 0);
- BUG_ON(ret);
- bounce_bs_setup = true;
-}
-
-static __init int init_emergency_pool(void)
-{
- int ret;
-
-#ifndef CONFIG_MEMORY_HOTPLUG
- if (max_pfn <= max_low_pfn)
- return 0;
-#endif
-
- ret = mempool_init_page_pool(&page_pool, POOL_SIZE, 0);
- BUG_ON(ret);
- pr_info("pool size: %d pages\n", POOL_SIZE);
-
- init_bounce_bioset();
- return 0;
-}
-
-__initcall(init_emergency_pool);
-
-/*
- * Simple bounce buffer support for highmem pages. Depending on the
- * queue gfp mask set, *to may or may not be a highmem page. kmap it
- * always, it will do the Right Thing
- */
-static void copy_to_high_bio_irq(struct bio *to, struct bio *from)
-{
- struct bio_vec tovec, fromvec;
- struct bvec_iter iter;
- /*
- * The bio of @from is created by bounce, so we can iterate
- * its bvec from start to end, but the @from->bi_iter can't be
- * trusted because it might be changed by splitting.
- */
- struct bvec_iter from_iter = BVEC_ITER_ALL_INIT;
-
- bio_for_each_segment(tovec, to, iter) {
- fromvec = bio_iter_iovec(from, from_iter);
- if (tovec.bv_page != fromvec.bv_page) {
- /*
- * fromvec->bv_offset and fromvec->bv_len might have
- * been modified by the block layer, so use the original
- * copy, bounce_copy_vec already uses tovec->bv_len
- */
- memcpy_to_bvec(&tovec, page_address(fromvec.bv_page) +
- tovec.bv_offset);
- }
- bio_advance_iter(from, &from_iter, tovec.bv_len);
- }
-}
-
-static void bounce_end_io(struct bio *bio)
-{
- struct bio *bio_orig = bio->bi_private;
- struct bio_vec *bvec, orig_vec;
- struct bvec_iter orig_iter = bio_orig->bi_iter;
- struct bvec_iter_all iter_all;
-
- /*
- * free up bounce indirect pages used
- */
- bio_for_each_segment_all(bvec, bio, iter_all) {
- orig_vec = bio_iter_iovec(bio_orig, orig_iter);
- if (bvec->bv_page != orig_vec.bv_page) {
- dec_zone_page_state(bvec->bv_page, NR_BOUNCE);
- mempool_free(bvec->bv_page, &page_pool);
- }
- bio_advance_iter(bio_orig, &orig_iter, orig_vec.bv_len);
- }
-
- bio_orig->bi_status = bio->bi_status;
- bio_endio(bio_orig);
- bio_put(bio);
-}
-
-static void bounce_end_io_write(struct bio *bio)
-{
- bounce_end_io(bio);
-}
-
-static void bounce_end_io_read(struct bio *bio)
-{
- struct bio *bio_orig = bio->bi_private;
-
- if (!bio->bi_status)
- copy_to_high_bio_irq(bio_orig, bio);
-
- bounce_end_io(bio);
-}
-
-static struct bio *bounce_clone_bio(struct bio *bio_src)
-{
- struct bvec_iter iter;
- struct bio_vec bv;
- struct bio *bio;
-
- /*
- * Pre immutable biovecs, __bio_clone() used to just do a memcpy from
- * bio_src->bi_io_vec to bio->bi_io_vec.
- *
- * We can't do that anymore, because:
- *
- * - The point of cloning the biovec is to produce a bio with a biovec
- * the caller can modify: bi_idx and bi_bvec_done should be 0.
- *
- * - The original bio could've had more than BIO_MAX_VECS biovecs; if
- * we tried to clone the whole thing bio_alloc_bioset() would fail.
- * But the clone should succeed as long as the number of biovecs we
- * actually need to allocate is fewer than BIO_MAX_VECS.
- *
- * - Lastly, bi_vcnt should not be looked at or relied upon by code
- * that does not own the bio - reason being drivers don't use it for
- * iterating over the biovec anymore, so expecting it to be kept up
- * to date (i.e. for clones that share the parent biovec) is just
- * asking for trouble and would force extra work.
- */
- bio = bio_alloc_bioset(bio_src->bi_bdev, bio_segments(bio_src),
- bio_src->bi_opf, GFP_NOIO, &bounce_bio_set);
- if (bio_flagged(bio_src, BIO_REMAPPED))
- bio_set_flag(bio, BIO_REMAPPED);
- bio->bi_ioprio = bio_src->bi_ioprio;
- bio->bi_write_hint = bio_src->bi_write_hint;
- bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector;
- bio->bi_iter.bi_size = bio_src->bi_iter.bi_size;
-
- switch (bio_op(bio)) {
- case REQ_OP_DISCARD:
- case REQ_OP_SECURE_ERASE:
- case REQ_OP_WRITE_ZEROES:
- break;
- default:
- bio_for_each_segment(bv, bio_src, iter)
- bio->bi_io_vec[bio->bi_vcnt++] = bv;
- break;
- }
-
- if (bio_crypt_clone(bio, bio_src, GFP_NOIO) < 0)
- goto err_put;
-
- if (bio_integrity(bio_src) &&
- bio_integrity_clone(bio, bio_src, GFP_NOIO) < 0)
- goto err_put;
-
- bio_clone_blkg_association(bio, bio_src);
-
- return bio;
-
-err_put:
- bio_put(bio);
- return NULL;
-}
-
-struct bio *__blk_queue_bounce(struct bio *bio_orig, struct request_queue *q)
-{
- struct bio *bio;
- int rw = bio_data_dir(bio_orig);
- struct bio_vec *to, from;
- struct bvec_iter iter;
- unsigned i = 0, bytes = 0;
- bool bounce = false;
- int sectors;
-
- bio_for_each_segment(from, bio_orig, iter) {
- if (i++ < BIO_MAX_VECS)
- bytes += from.bv_len;
- if (PageHighMem(from.bv_page))
- bounce = true;
- }
- if (!bounce)
- return bio_orig;
-
- /*
- * Individual bvecs might not be logical block aligned. Round down
- * the split size so that each bio is properly block size aligned,
- * even if we do not use the full hardware limits.
- */
- sectors = ALIGN_DOWN(bytes, queue_logical_block_size(q)) >>
- SECTOR_SHIFT;
- if (sectors < bio_sectors(bio_orig)) {
- bio = bio_split(bio_orig, sectors, GFP_NOIO, &bounce_bio_split);
- bio_chain(bio, bio_orig);
- submit_bio_noacct(bio_orig);
- bio_orig = bio;
- }
- bio = bounce_clone_bio(bio_orig);
-
- /*
- * Bvec table can't be updated by bio_for_each_segment_all(),
- * so retrieve bvec from the table directly. This way is safe
- * because the 'bio' is single-page bvec.
- */
- for (i = 0, to = bio->bi_io_vec; i < bio->bi_vcnt; to++, i++) {
- struct page *bounce_page;
-
- if (!PageHighMem(to->bv_page))
- continue;
-
- bounce_page = mempool_alloc(&page_pool, GFP_NOIO);
- inc_zone_page_state(bounce_page, NR_BOUNCE);
-
- if (rw == WRITE) {
- flush_dcache_page(to->bv_page);
- memcpy_from_bvec(page_address(bounce_page), to);
- }
- to->bv_page = bounce_page;
- }
-
- trace_block_bio_bounce(bio_orig);
-
- bio->bi_flags |= (1 << BIO_BOUNCED);
-
- if (rw == READ)
- bio->bi_end_io = bounce_end_io_read;
- else
- bio->bi_end_io = bounce_end_io_write;
-
- bio->bi_private = bio_orig;
- return bio;
-}
diff --git a/block/elevator.c b/block/elevator.c
index b4d08026b02c..ab22542e6cf0 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -45,6 +45,17 @@
#include "blk-wbt.h"
#include "blk-cgroup.h"
+/* Holding context data for changing elevator */
+struct elv_change_ctx {
+ const char *name;
+ bool no_uevent;
+
+ /* for unregistering old elevator */
+ struct elevator_queue *old;
+ /* for registering new elevator */
+ struct elevator_queue *new;
+};
+
static DEFINE_SPINLOCK(elv_list_lock);
static LIST_HEAD(elv_list);
@@ -148,18 +159,18 @@ static void elevator_release(struct kobject *kobj)
kfree(e);
}
-void elevator_exit(struct request_queue *q)
+static void elevator_exit(struct request_queue *q)
{
struct elevator_queue *e = q->elevator;
+ lockdep_assert_held(&q->elevator_lock);
+
ioc_clear_queue(q);
blk_mq_sched_free_rqs(q);
mutex_lock(&e->sysfs_lock);
blk_mq_exit_sched(q, e);
mutex_unlock(&e->sysfs_lock);
-
- kobject_put(&e->kobj);
}
static inline void __elv_rqhash_del(struct request *rq)
@@ -412,14 +423,15 @@ elv_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
{
const struct elv_fs_entry *entry = to_elv(attr);
struct elevator_queue *e;
- ssize_t error;
+ ssize_t error = -ENODEV;
if (!entry->show)
return -EIO;
e = container_of(kobj, struct elevator_queue, kobj);
mutex_lock(&e->sysfs_lock);
- error = e->type ? entry->show(e, page) : -ENOENT;
+ if (!test_bit(ELEVATOR_FLAG_DYING, &e->flags))
+ error = entry->show(e, page);
mutex_unlock(&e->sysfs_lock);
return error;
}
@@ -430,14 +442,15 @@ elv_attr_store(struct kobject *kobj, struct attribute *attr,
{
const struct elv_fs_entry *entry = to_elv(attr);
struct elevator_queue *e;
- ssize_t error;
+ ssize_t error = -ENODEV;
if (!entry->store)
return -EIO;
e = container_of(kobj, struct elevator_queue, kobj);
mutex_lock(&e->sysfs_lock);
- error = e->type ? entry->store(e, page, length) : -ENOENT;
+ if (!test_bit(ELEVATOR_FLAG_DYING, &e->flags))
+ error = entry->store(e, page, length);
mutex_unlock(&e->sysfs_lock);
return error;
}
@@ -452,13 +465,12 @@ static const struct kobj_type elv_ktype = {
.release = elevator_release,
};
-int elv_register_queue(struct request_queue *q, bool uevent)
+static int elv_register_queue(struct request_queue *q,
+ struct elevator_queue *e,
+ bool uevent)
{
- struct elevator_queue *e = q->elevator;
int error;
- lockdep_assert_held(&q->elevator_lock);
-
error = kobject_add(&e->kobj, &q->disk->queue_kobj, "iosched");
if (!error) {
const struct elv_fs_entry *attr = e->type->elevator_attrs;
@@ -472,20 +484,25 @@ int elv_register_queue(struct request_queue *q, bool uevent)
if (uevent)
kobject_uevent(&e->kobj, KOBJ_ADD);
+ /*
+ * Sched is initialized, it is ready to export it via
+ * debugfs
+ */
+ blk_mq_sched_reg_debugfs(q);
set_bit(ELEVATOR_FLAG_REGISTERED, &e->flags);
}
return error;
}
-void elv_unregister_queue(struct request_queue *q)
+static void elv_unregister_queue(struct request_queue *q,
+ struct elevator_queue *e)
{
- struct elevator_queue *e = q->elevator;
-
- lockdep_assert_held(&q->elevator_lock);
-
if (e && test_and_clear_bit(ELEVATOR_FLAG_REGISTERED, &e->flags)) {
kobject_uevent(&e->kobj, KOBJ_REMOVE);
kobject_del(&e->kobj);
+
+ /* unexport via debugfs before exiting sched */
+ blk_mq_sched_unreg_debugfs(q);
}
}
@@ -548,159 +565,194 @@ void elv_unregister(struct elevator_type *e)
EXPORT_SYMBOL_GPL(elv_unregister);
/*
- * For single queue devices, default to using mq-deadline. If we have multiple
- * queues or mq-deadline is not available, default to "none".
- */
-static struct elevator_type *elevator_get_default(struct request_queue *q)
-{
- if (q->tag_set->flags & BLK_MQ_F_NO_SCHED_BY_DEFAULT)
- return NULL;
-
- if (q->nr_hw_queues != 1 &&
- !blk_mq_is_shared_tags(q->tag_set->flags))
- return NULL;
-
- return elevator_find_get("mq-deadline");
-}
-
-/*
- * Use the default elevator settings. If the chosen elevator initialization
- * fails, fall back to the "none" elevator (no elevator).
- */
-void elevator_init_mq(struct request_queue *q)
-{
- struct elevator_type *e;
- unsigned int memflags;
- int err;
-
- WARN_ON_ONCE(blk_queue_registered(q));
-
- if (unlikely(q->elevator))
- return;
-
- e = elevator_get_default(q);
- if (!e)
- return;
-
- /*
- * We are called before adding disk, when there isn't any FS I/O,
- * so freezing queue plus canceling dispatch work is enough to
- * drain any dispatch activities originated from passthrough
- * requests, then no need to quiesce queue which may add long boot
- * latency, especially when lots of disks are involved.
- *
- * Disk isn't added yet, so verifying queue lock only manually.
- */
- memflags = blk_mq_freeze_queue(q);
-
- blk_mq_cancel_work_sync(q);
-
- err = blk_mq_init_sched(q, e);
-
- blk_mq_unfreeze_queue(q, memflags);
-
- if (err) {
- pr_warn("\"%s\" elevator initialization failed, "
- "falling back to \"none\"\n", e->elevator_name);
- }
-
- elevator_put(e);
-}
-
-/*
* Switch to new_e io scheduler.
*
* If switching fails, we are most likely running out of memory and not able
* to restore the old io scheduler, so leaving the io scheduler being none.
*/
-int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
+static int elevator_switch(struct request_queue *q, struct elv_change_ctx *ctx)
{
- unsigned int memflags;
- int ret;
+ struct elevator_type *new_e = NULL;
+ int ret = 0;
+ WARN_ON_ONCE(q->mq_freeze_depth == 0);
lockdep_assert_held(&q->elevator_lock);
- memflags = blk_mq_freeze_queue(q);
+ if (strncmp(ctx->name, "none", 4)) {
+ new_e = elevator_find_get(ctx->name);
+ if (!new_e)
+ return -EINVAL;
+ }
+
blk_mq_quiesce_queue(q);
if (q->elevator) {
- elv_unregister_queue(q);
+ ctx->old = q->elevator;
elevator_exit(q);
}
- ret = blk_mq_init_sched(q, new_e);
- if (ret)
- goto out_unfreeze;
-
- ret = elv_register_queue(q, true);
- if (ret) {
- elevator_exit(q);
- goto out_unfreeze;
+ if (new_e) {
+ ret = blk_mq_init_sched(q, new_e);
+ if (ret)
+ goto out_unfreeze;
+ ctx->new = q->elevator;
+ } else {
+ blk_queue_flag_clear(QUEUE_FLAG_SQ_SCHED, q);
+ q->elevator = NULL;
+ q->nr_requests = q->tag_set->queue_depth;
}
- blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name);
+ blk_add_trace_msg(q, "elv switch: %s", ctx->name);
out_unfreeze:
blk_mq_unquiesce_queue(q);
- blk_mq_unfreeze_queue(q, memflags);
if (ret) {
pr_warn("elv: switch to \"%s\" failed, falling back to \"none\"\n",
new_e->elevator_name);
}
+ if (new_e)
+ elevator_put(new_e);
return ret;
}
-void elevator_disable(struct request_queue *q)
+static void elv_exit_and_release(struct request_queue *q)
{
- unsigned int memflags;
-
- lockdep_assert_held(&q->elevator_lock);
+ struct elevator_queue *e;
+ unsigned memflags;
memflags = blk_mq_freeze_queue(q);
- blk_mq_quiesce_queue(q);
-
- elv_unregister_queue(q);
+ mutex_lock(&q->elevator_lock);
+ e = q->elevator;
elevator_exit(q);
- blk_queue_flag_clear(QUEUE_FLAG_SQ_SCHED, q);
- q->elevator = NULL;
- q->nr_requests = q->tag_set->queue_depth;
- blk_add_trace_msg(q, "elv switch: none");
-
- blk_mq_unquiesce_queue(q);
+ mutex_unlock(&q->elevator_lock);
blk_mq_unfreeze_queue(q, memflags);
+ if (e)
+ kobject_put(&e->kobj);
+}
+
+static int elevator_change_done(struct request_queue *q,
+ struct elv_change_ctx *ctx)
+{
+ int ret = 0;
+
+ if (ctx->old) {
+ bool enable_wbt = test_bit(ELEVATOR_FLAG_ENABLE_WBT_ON_EXIT,
+ &ctx->old->flags);
+
+ elv_unregister_queue(q, ctx->old);
+ kobject_put(&ctx->old->kobj);
+ if (enable_wbt)
+ wbt_enable_default(q->disk);
+ }
+ if (ctx->new) {
+ ret = elv_register_queue(q, ctx->new, !ctx->no_uevent);
+ if (ret)
+ elv_exit_and_release(q);
+ }
+ return ret;
}
/*
* Switch this queue to the given IO scheduler.
*/
-static int elevator_change(struct request_queue *q, const char *elevator_name)
+static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx)
{
- struct elevator_type *e;
- int ret;
+ unsigned int memflags;
+ int ret = 0;
- /* Make sure queue is not in the middle of being removed */
- if (!blk_queue_registered(q))
- return -ENOENT;
+ lockdep_assert_held(&q->tag_set->update_nr_hwq_lock);
+
+ memflags = blk_mq_freeze_queue(q);
+ /*
+ * May be called before adding disk, when there isn't any FS I/O,
+ * so freezing queue plus canceling dispatch work is enough to
+ * drain any dispatch activities originated from passthrough
+ * requests, then no need to quiesce queue which may add long boot
+ * latency, especially when lots of disks are involved.
+ *
+ * Disk isn't added yet, so verifying queue lock only manually.
+ */
+ blk_mq_cancel_work_sync(q);
+ mutex_lock(&q->elevator_lock);
+ if (!(q->elevator && elevator_match(q->elevator->type, ctx->name)))
+ ret = elevator_switch(q, ctx);
+ mutex_unlock(&q->elevator_lock);
+ blk_mq_unfreeze_queue(q, memflags);
+ if (!ret)
+ ret = elevator_change_done(q, ctx);
+
+ return ret;
+}
+
+/*
+ * The I/O scheduler depends on the number of hardware queues, this forces a
+ * reattachment when nr_hw_queues changes.
+ */
+void elv_update_nr_hw_queues(struct request_queue *q)
+{
+ struct elv_change_ctx ctx = {};
+ int ret = -ENODEV;
+
+ WARN_ON_ONCE(q->mq_freeze_depth == 0);
- if (!strncmp(elevator_name, "none", 4)) {
- if (q->elevator)
- elevator_disable(q);
- return 0;
+ mutex_lock(&q->elevator_lock);
+ if (q->elevator && !blk_queue_dying(q) && blk_queue_registered(q)) {
+ ctx.name = q->elevator->type->elevator_name;
+
+ /* force to reattach elevator after nr_hw_queue is updated */
+ ret = elevator_switch(q, &ctx);
}
+ mutex_unlock(&q->elevator_lock);
+ blk_mq_unfreeze_queue_nomemrestore(q);
+ if (!ret)
+ WARN_ON_ONCE(elevator_change_done(q, &ctx));
+}
- if (q->elevator && elevator_match(q->elevator->type, elevator_name))
- return 0;
+/*
+ * Use the default elevator settings. If the chosen elevator initialization
+ * fails, fall back to the "none" elevator (no elevator).
+ */
+void elevator_set_default(struct request_queue *q)
+{
+ struct elv_change_ctx ctx = {
+ .name = "mq-deadline",
+ .no_uevent = true,
+ };
+ int err = 0;
- e = elevator_find_get(elevator_name);
- if (!e)
- return -EINVAL;
- ret = elevator_switch(q, e);
- elevator_put(e);
- return ret;
+ /* now we allow to switch elevator */
+ blk_queue_flag_clear(QUEUE_FLAG_NO_ELV_SWITCH, q);
+
+ if (q->tag_set->flags & BLK_MQ_F_NO_SCHED_BY_DEFAULT)
+ return;
+
+ /*
+ * For single queue devices, default to using mq-deadline. If we
+ * have multiple queues or mq-deadline is not available, default
+ * to "none".
+ */
+ if (elevator_find_get(ctx.name) && (q->nr_hw_queues == 1 ||
+ blk_mq_is_shared_tags(q->tag_set->flags)))
+ err = elevator_change(q, &ctx);
+ if (err < 0)
+ pr_warn("\"%s\" elevator initialization, failed %d, "
+ "falling back to \"none\"\n", ctx.name, err);
}
-static void elv_iosched_load_module(char *elevator_name)
+void elevator_set_none(struct request_queue *q)
+{
+ struct elv_change_ctx ctx = {
+ .name = "none",
+ };
+ int err;
+
+ err = elevator_change(q, &ctx);
+ if (err < 0)
+ pr_warn("%s: set none elevator failed %d\n", __func__, err);
+}
+
+static void elv_iosched_load_module(const char *elevator_name)
{
struct elevator_type *found;
@@ -716,10 +768,14 @@ ssize_t elv_iosched_store(struct gendisk *disk, const char *buf,
size_t count)
{
char elevator_name[ELV_NAME_MAX];
- char *name;
+ struct elv_change_ctx ctx = {};
int ret;
- unsigned int memflags;
struct request_queue *q = disk->queue;
+ struct blk_mq_tag_set *set = q->tag_set;
+
+ /* Make sure queue is not in the middle of being removed */
+ if (!blk_queue_registered(q))
+ return -ENOENT;
/*
* If the attribute needs to load a module, do it before freezing the
@@ -727,24 +783,25 @@ ssize_t elv_iosched_store(struct gendisk *disk, const char *buf,
* queue is the one for the device storing the module file.
*/
strscpy(elevator_name, buf, sizeof(elevator_name));
- name = strstrip(elevator_name);
+ ctx.name = strstrip(elevator_name);
- elv_iosched_load_module(name);
+ elv_iosched_load_module(ctx.name);
- memflags = blk_mq_freeze_queue(q);
- mutex_lock(&q->elevator_lock);
- ret = elevator_change(q, name);
- if (!ret)
- ret = count;
- mutex_unlock(&q->elevator_lock);
- blk_mq_unfreeze_queue(q, memflags);
+ down_read(&set->update_nr_hwq_lock);
+ if (!blk_queue_no_elv_switch(q)) {
+ ret = elevator_change(q, &ctx);
+ if (!ret)
+ ret = count;
+ } else {
+ ret = -ENOENT;
+ }
+ up_read(&set->update_nr_hwq_lock);
return ret;
}
ssize_t elv_iosched_show(struct gendisk *disk, char *name)
{
struct request_queue *q = disk->queue;
- struct elevator_queue *eq = q->elevator;
struct elevator_type *cur = NULL, *e;
int len = 0;
@@ -753,7 +810,7 @@ ssize_t elv_iosched_show(struct gendisk *disk, char *name)
len += sprintf(name+len, "[none] ");
} else {
len += sprintf(name+len, "none ");
- cur = eq->type;
+ cur = q->elevator->type;
}
spin_lock(&elv_list_lock);
diff --git a/block/elevator.h b/block/elevator.h
index e4e44dfac503..a07ce773a38f 100644
--- a/block/elevator.h
+++ b/block/elevator.h
@@ -121,7 +121,8 @@ struct elevator_queue
};
#define ELEVATOR_FLAG_REGISTERED 0
-#define ELEVATOR_FLAG_DISABLE_WBT 1
+#define ELEVATOR_FLAG_DYING 1
+#define ELEVATOR_FLAG_ENABLE_WBT_ON_EXIT 2
/*
* block elevator interface
@@ -182,4 +183,7 @@ extern struct request *elv_rb_find(struct rb_root *, sector_t);
#define rq_entry_fifo(ptr) list_entry((ptr), struct request, queuelist)
#define rq_fifo_clear(rq) list_del_init(&(rq)->queuelist)
+void blk_mq_sched_reg_debugfs(struct request_queue *q);
+void blk_mq_sched_unreg_debugfs(struct request_queue *q);
+
#endif /* _ELEVATOR_H */
diff --git a/block/fops.c b/block/fops.c
index 82b672d15ea4..1309861d4c2c 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -73,6 +73,7 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
}
bio.bi_iter.bi_sector = pos >> SECTOR_SHIFT;
bio.bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint;
+ bio.bi_write_stream = iocb->ki_write_stream;
bio.bi_ioprio = iocb->ki_ioprio;
if (iocb->ki_flags & IOCB_ATOMIC)
bio.bi_opf |= REQ_ATOMIC;
@@ -206,6 +207,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
for (;;) {
bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT;
bio->bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint;
+ bio->bi_write_stream = iocb->ki_write_stream;
bio->bi_private = dio;
bio->bi_end_io = blkdev_bio_end_io;
bio->bi_ioprio = iocb->ki_ioprio;
@@ -333,6 +335,7 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
dio->iocb = iocb;
bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT;
bio->bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint;
+ bio->bi_write_stream = iocb->ki_write_stream;
bio->bi_end_io = blkdev_bio_end_io_async;
bio->bi_ioprio = iocb->ki_ioprio;
@@ -398,6 +401,26 @@ static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
if (blkdev_dio_invalid(bdev, iocb, iter))
return -EINVAL;
+ if (iov_iter_rw(iter) == WRITE) {
+ u16 max_write_streams = bdev_max_write_streams(bdev);
+
+ if (iocb->ki_write_stream) {
+ if (iocb->ki_write_stream > max_write_streams)
+ return -EINVAL;
+ } else if (max_write_streams) {
+ enum rw_hint write_hint =
+ file_inode(iocb->ki_filp)->i_write_hint;
+
+ /*
+ * Just use the write hint as write stream for block
+ * device writes. This assumes no file system is
+ * mounted that would use the streams differently.
+ */
+ if (write_hint <= max_write_streams)
+ iocb->ki_write_stream = write_hint;
+ }
+ }
+
nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1);
if (likely(nr_pages <= BIO_MAX_VECS)) {
if (is_sync_kiocb(iocb))
@@ -451,12 +474,13 @@ static int blkdev_get_block(struct inode *inode, sector_t iblock,
static int blkdev_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
+ struct folio *folio = NULL;
struct blk_plug plug;
int err;
blk_start_plug(&plug);
- err = write_cache_pages(mapping, wbc, block_write_full_folio,
- blkdev_get_block);
+ while ((folio = writeback_iter(mapping, wbc, folio, &err)))
+ err = block_write_full_folio(folio, wbc, blkdev_get_block);
blk_finish_plug(&plug);
return err;
diff --git a/block/genhd.c b/block/genhd.c
index c2bd86cd09de..8171a6bc3210 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -125,37 +125,46 @@ static void part_stat_read_all(struct block_device *part,
}
}
-unsigned int part_in_flight(struct block_device *part)
+static void bdev_count_inflight_rw(struct block_device *part,
+ unsigned int inflight[2], bool mq_driver)
{
- unsigned int inflight = 0;
int cpu;
- for_each_possible_cpu(cpu) {
- inflight += part_stat_local_read_cpu(part, in_flight[0], cpu) +
- part_stat_local_read_cpu(part, in_flight[1], cpu);
+ if (mq_driver) {
+ blk_mq_in_driver_rw(part, inflight);
+ } else {
+ for_each_possible_cpu(cpu) {
+ inflight[READ] += part_stat_local_read_cpu(
+ part, in_flight[READ], cpu);
+ inflight[WRITE] += part_stat_local_read_cpu(
+ part, in_flight[WRITE], cpu);
+ }
}
- if ((int)inflight < 0)
- inflight = 0;
- return inflight;
+ if (WARN_ON_ONCE((int)inflight[READ] < 0))
+ inflight[READ] = 0;
+ if (WARN_ON_ONCE((int)inflight[WRITE] < 0))
+ inflight[WRITE] = 0;
}
-static void part_in_flight_rw(struct block_device *part,
- unsigned int inflight[2])
+/**
+ * bdev_count_inflight - get the number of inflight IOs for a block device.
+ *
+ * @part: the block device.
+ *
+ * Inflight here means started IO accounting, from bdev_start_io_acct() for
+ * bio-based block device, and from blk_account_io_start() for rq-based block
+ * device.
+ */
+unsigned int bdev_count_inflight(struct block_device *part)
{
- int cpu;
+ unsigned int inflight[2] = {0};
- inflight[0] = 0;
- inflight[1] = 0;
- for_each_possible_cpu(cpu) {
- inflight[0] += part_stat_local_read_cpu(part, in_flight[0], cpu);
- inflight[1] += part_stat_local_read_cpu(part, in_flight[1], cpu);
- }
- if ((int)inflight[0] < 0)
- inflight[0] = 0;
- if ((int)inflight[1] < 0)
- inflight[1] = 0;
+ bdev_count_inflight_rw(part, inflight, false);
+
+ return inflight[READ] + inflight[WRITE];
}
+EXPORT_SYMBOL_GPL(bdev_count_inflight);
/*
* Can be deleted altogether. Later.
@@ -389,19 +398,35 @@ int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode)
return ret;
}
-/**
- * add_disk_fwnode - add disk information to kernel list with fwnode
- * @parent: parent device for the disk
- * @disk: per-device partitioning information
- * @groups: Additional per-device sysfs groups
- * @fwnode: attached disk fwnode
- *
- * This function registers the partitioning information in @disk
- * with the kernel. Also attach a fwnode to the disk device.
- */
-int __must_check add_disk_fwnode(struct device *parent, struct gendisk *disk,
- const struct attribute_group **groups,
- struct fwnode_handle *fwnode)
+static void add_disk_final(struct gendisk *disk)
+{
+ struct device *ddev = disk_to_dev(disk);
+
+ if (!(disk->flags & GENHD_FL_HIDDEN)) {
+ /* Make sure the first partition scan will be proceed */
+ if (get_capacity(disk) && disk_has_partscan(disk))
+ set_bit(GD_NEED_PART_SCAN, &disk->state);
+
+ bdev_add(disk->part0, ddev->devt);
+ if (get_capacity(disk))
+ disk_scan_partitions(disk, BLK_OPEN_READ);
+
+ /*
+ * Announce the disk and partitions after all partitions are
+ * created. (for hidden disks uevents remain suppressed forever)
+ */
+ dev_set_uevent_suppress(ddev, 0);
+ disk_uevent(disk, KOBJ_ADD);
+ }
+
+ blk_apply_bdi_limits(disk->bdi, &disk->queue->limits);
+ disk_add_events(disk);
+ set_bit(GD_ADDED, &disk->state);
+}
+
+static int __add_disk(struct device *parent, struct gendisk *disk,
+ const struct attribute_group **groups,
+ struct fwnode_handle *fwnode)
{
struct device *ddev = disk_to_dev(disk);
@@ -416,12 +441,6 @@ int __must_check add_disk_fwnode(struct device *parent, struct gendisk *disk,
*/
if (disk->fops->submit_bio || disk->fops->poll_bio)
return -EINVAL;
-
- /*
- * Initialize the I/O scheduler code and pick a default one if
- * needed.
- */
- elevator_init_mq(disk->queue);
} else {
if (!disk->fops->submit_bio)
return -EINVAL;
@@ -438,7 +457,7 @@ int __must_check add_disk_fwnode(struct device *parent, struct gendisk *disk,
ret = -EINVAL;
if (disk->major) {
if (WARN_ON(!disk->minors))
- goto out_exit_elevator;
+ goto out;
if (disk->minors > DISK_MAX_PARTS) {
pr_err("block: can't allocate more than %d partitions\n",
@@ -448,14 +467,14 @@ int __must_check add_disk_fwnode(struct device *parent, struct gendisk *disk,
if (disk->first_minor > MINORMASK ||
disk->minors > MINORMASK + 1 ||
disk->first_minor + disk->minors > MINORMASK + 1)
- goto out_exit_elevator;
+ goto out;
} else {
if (WARN_ON(disk->minors))
- goto out_exit_elevator;
+ goto out;
ret = blk_alloc_ext_minor();
if (ret < 0)
- goto out_exit_elevator;
+ goto out;
disk->major = BLOCK_EXT_MAJOR;
disk->first_minor = ret;
}
@@ -516,21 +535,6 @@ int __must_check add_disk_fwnode(struct device *parent, struct gendisk *disk,
&disk->bdi->dev->kobj, "bdi");
if (ret)
goto out_unregister_bdi;
-
- /* Make sure the first partition scan will be proceed */
- if (get_capacity(disk) && disk_has_partscan(disk))
- set_bit(GD_NEED_PART_SCAN, &disk->state);
-
- bdev_add(disk->part0, ddev->devt);
- if (get_capacity(disk))
- disk_scan_partitions(disk, BLK_OPEN_READ);
-
- /*
- * Announce the disk and partitions after all partitions are
- * created. (for hidden disks uevents remain suppressed forever)
- */
- dev_set_uevent_suppress(ddev, 0);
- disk_uevent(disk, KOBJ_ADD);
} else {
/*
* Even if the block_device for a hidden gendisk is not
@@ -539,10 +543,6 @@ int __must_check add_disk_fwnode(struct device *parent, struct gendisk *disk,
*/
disk->part0->bd_dev = MKDEV(disk->major, disk->first_minor);
}
-
- blk_apply_bdi_limits(disk->bdi, &disk->queue->limits);
- disk_add_events(disk);
- set_bit(GD_ADDED, &disk->state);
return 0;
out_unregister_bdi:
@@ -564,12 +564,46 @@ out_device_del:
out_free_ext_minor:
if (disk->major == BLOCK_EXT_MAJOR)
blk_free_ext_minor(disk->first_minor);
-out_exit_elevator:
- if (disk->queue->elevator) {
- mutex_lock(&disk->queue->elevator_lock);
- elevator_exit(disk->queue);
- mutex_unlock(&disk->queue->elevator_lock);
+out:
+ return ret;
+}
+
+/**
+ * add_disk_fwnode - add disk information to kernel list with fwnode
+ * @parent: parent device for the disk
+ * @disk: per-device partitioning information
+ * @groups: Additional per-device sysfs groups
+ * @fwnode: attached disk fwnode
+ *
+ * This function registers the partitioning information in @disk
+ * with the kernel. Also attach a fwnode to the disk device.
+ */
+int __must_check add_disk_fwnode(struct device *parent, struct gendisk *disk,
+ const struct attribute_group **groups,
+ struct fwnode_handle *fwnode)
+{
+ struct blk_mq_tag_set *set;
+ unsigned int memflags;
+ int ret;
+
+ if (queue_is_mq(disk->queue)) {
+ set = disk->queue->tag_set;
+ memflags = memalloc_noio_save();
+ down_read(&set->update_nr_hwq_lock);
+ ret = __add_disk(parent, disk, groups, fwnode);
+ up_read(&set->update_nr_hwq_lock);
+ memalloc_noio_restore(memflags);
+ } else {
+ ret = __add_disk(parent, disk, groups, fwnode);
}
+
+ /*
+ * add_disk_final() needn't to read `nr_hw_queues`, so move it out
+ * of read lock `set->update_nr_hwq_lock` for avoiding unnecessary
+ * lock dependency on `disk->open_mutex` from scanning partition.
+ */
+ if (!ret)
+ add_disk_final(disk);
return ret;
}
EXPORT_SYMBOL_GPL(add_disk_fwnode);
@@ -652,26 +686,7 @@ void blk_mark_disk_dead(struct gendisk *disk)
}
EXPORT_SYMBOL_GPL(blk_mark_disk_dead);
-/**
- * del_gendisk - remove the gendisk
- * @disk: the struct gendisk to remove
- *
- * Removes the gendisk and all its associated resources. This deletes the
- * partitions associated with the gendisk, and unregisters the associated
- * request_queue.
- *
- * This is the counter to the respective __device_add_disk() call.
- *
- * The final removal of the struct gendisk happens when its refcount reaches 0
- * with put_disk(), which should be called after del_gendisk(), if
- * __device_add_disk() was used.
- *
- * Drivers exist which depend on the release of the gendisk to be synchronous,
- * it should not be deferred.
- *
- * Context: can sleep
- */
-void del_gendisk(struct gendisk *disk)
+static void __del_gendisk(struct gendisk *disk)
{
struct request_queue *q = disk->queue;
struct block_device *part;
@@ -743,14 +758,7 @@ void del_gendisk(struct gendisk *disk)
if (queue_is_mq(q))
blk_mq_cancel_work_sync(q);
- blk_mq_quiesce_queue(q);
- if (q->elevator) {
- mutex_lock(&q->elevator_lock);
- elevator_exit(q);
- mutex_unlock(&q->elevator_lock);
- }
rq_qos_exit(q);
- blk_mq_unquiesce_queue(q);
/*
* If the disk does not own the queue, allow using passthrough requests
@@ -764,6 +772,55 @@ void del_gendisk(struct gendisk *disk)
if (start_drain)
blk_unfreeze_release_lock(q);
}
+
+static void disable_elv_switch(struct request_queue *q)
+{
+ struct blk_mq_tag_set *set = q->tag_set;
+ WARN_ON_ONCE(!queue_is_mq(q));
+
+ down_write(&set->update_nr_hwq_lock);
+ blk_queue_flag_set(QUEUE_FLAG_NO_ELV_SWITCH, q);
+ up_write(&set->update_nr_hwq_lock);
+}
+
+/**
+ * del_gendisk - remove the gendisk
+ * @disk: the struct gendisk to remove
+ *
+ * Removes the gendisk and all its associated resources. This deletes the
+ * partitions associated with the gendisk, and unregisters the associated
+ * request_queue.
+ *
+ * This is the counter to the respective __device_add_disk() call.
+ *
+ * The final removal of the struct gendisk happens when its refcount reaches 0
+ * with put_disk(), which should be called after del_gendisk(), if
+ * __device_add_disk() was used.
+ *
+ * Drivers exist which depend on the release of the gendisk to be synchronous,
+ * it should not be deferred.
+ *
+ * Context: can sleep
+ */
+void del_gendisk(struct gendisk *disk)
+{
+ struct blk_mq_tag_set *set;
+ unsigned int memflags;
+
+ if (!queue_is_mq(disk->queue)) {
+ __del_gendisk(disk);
+ } else {
+ set = disk->queue->tag_set;
+
+ disable_elv_switch(disk->queue);
+
+ memflags = memalloc_noio_save();
+ down_read(&set->update_nr_hwq_lock);
+ __del_gendisk(disk);
+ up_read(&set->update_nr_hwq_lock);
+ memalloc_noio_restore(memflags);
+ }
+}
EXPORT_SYMBOL(del_gendisk);
/**
@@ -1005,7 +1062,7 @@ ssize_t part_stat_show(struct device *dev,
struct disk_stats stat;
unsigned int inflight;
- inflight = part_in_flight(bdev);
+ inflight = bdev_count_inflight(bdev);
if (inflight) {
part_stat_lock();
update_io_ticks(bdev, jiffies, true);
@@ -1042,19 +1099,21 @@ ssize_t part_stat_show(struct device *dev,
(unsigned int)div_u64(stat.nsecs[STAT_FLUSH], NSEC_PER_MSEC));
}
+/*
+ * Show the number of IOs issued to driver.
+ * For bio-based device, started from bdev_start_io_acct();
+ * For rq-based device, started from blk_mq_start_request();
+ */
ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
struct block_device *bdev = dev_to_bdev(dev);
struct request_queue *q = bdev_get_queue(bdev);
- unsigned int inflight[2];
+ unsigned int inflight[2] = {0};
- if (queue_is_mq(q))
- blk_mq_in_flight_rw(q, bdev, inflight);
- else
- part_in_flight_rw(bdev, inflight);
+ bdev_count_inflight_rw(bdev, inflight, queue_is_mq(q));
- return sysfs_emit(buf, "%8u %8u\n", inflight[0], inflight[1]);
+ return sysfs_emit(buf, "%8u %8u\n", inflight[READ], inflight[WRITE]);
}
static ssize_t disk_capability_show(struct device *dev,
@@ -1307,7 +1366,7 @@ static int diskstats_show(struct seq_file *seqf, void *v)
if (bdev_is_partition(hd) && !bdev_nr_sectors(hd))
continue;
- inflight = part_in_flight(hd);
+ inflight = bdev_count_inflight(hd);
if (inflight) {
part_stat_lock();
update_io_ticks(hd, jiffies, true);
@@ -1422,6 +1481,7 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
#ifdef CONFIG_BLOCK_HOLDER_DEPRECATED
INIT_LIST_HEAD(&disk->slave_bdevs);
#endif
+ mutex_init(&disk->rqos_state_mutex);
return disk;
out_erase_part0:
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index 754f6b7415cd..2edf1cac06d5 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -715,7 +715,7 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
}
/*
- * Called from blk_mq_insert_request() or blk_mq_dispatch_plug_list().
+ * Called from blk_mq_insert_request() or blk_mq_dispatch_list().
*/
static void dd_insert_requests(struct blk_mq_hw_ctx *hctx,
struct list_head *list,
diff --git a/crypto/algif_hash.c b/crypto/algif_hash.c
index 5498a87249d3..e3f1a4852737 100644
--- a/crypto/algif_hash.c
+++ b/crypto/algif_hash.c
@@ -265,10 +265,6 @@ static int hash_accept(struct socket *sock, struct socket *newsock,
goto out_free_state;
err = crypto_ahash_import(&ctx2->req, state);
- if (err) {
- sock_orphan(sk2);
- sock_put(sk2);
- }
out_free_state:
kfree_sensitive(state);
diff --git a/drivers/accel/ivpu/ivpu_debugfs.c b/drivers/accel/ivpu/ivpu_debugfs.c
index f0dad0c9ce33..cd24ccd20ba6 100644
--- a/drivers/accel/ivpu/ivpu_debugfs.c
+++ b/drivers/accel/ivpu/ivpu_debugfs.c
@@ -455,7 +455,7 @@ priority_bands_fops_write(struct file *file, const char __user *user_buf, size_t
if (ret < 0)
return ret;
- buf[size] = '\0';
+ buf[ret] = '\0';
ret = sscanf(buf, "%u %u %u %u", &band, &grace_period, &process_grace_period,
&process_quantum);
if (ret != 4)
diff --git a/drivers/acpi/pptt.c b/drivers/acpi/pptt.c
index f73ce6e13065..54676e3d82dd 100644
--- a/drivers/acpi/pptt.c
+++ b/drivers/acpi/pptt.c
@@ -231,16 +231,18 @@ static int acpi_pptt_leaf_node(struct acpi_table_header *table_hdr,
sizeof(struct acpi_table_pptt));
proc_sz = sizeof(struct acpi_pptt_processor);
- while ((unsigned long)entry + proc_sz < table_end) {
+ /* ignore subtable types that are smaller than a processor node */
+ while ((unsigned long)entry + proc_sz <= table_end) {
cpu_node = (struct acpi_pptt_processor *)entry;
+
if (entry->type == ACPI_PPTT_TYPE_PROCESSOR &&
cpu_node->parent == node_entry)
return 0;
if (entry->length == 0)
return 0;
+
entry = ACPI_ADD_PTR(struct acpi_subtable_header, entry,
entry->length);
-
}
return 1;
}
@@ -273,15 +275,18 @@ static struct acpi_pptt_processor *acpi_find_processor_node(struct acpi_table_he
proc_sz = sizeof(struct acpi_pptt_processor);
/* find the processor structure associated with this cpuid */
- while ((unsigned long)entry + proc_sz < table_end) {
+ while ((unsigned long)entry + proc_sz <= table_end) {
cpu_node = (struct acpi_pptt_processor *)entry;
if (entry->length == 0) {
pr_warn("Invalid zero length subtable\n");
break;
}
+ /* entry->length may not equal proc_sz, revalidate the processor structure length */
if (entry->type == ACPI_PPTT_TYPE_PROCESSOR &&
acpi_cpu_id == cpu_node->acpi_processor_id &&
+ (unsigned long)entry + entry->length <= table_end &&
+ entry->length == proc_sz + cpu_node->number_of_priv_resources * sizeof(u32) &&
acpi_pptt_leaf_node(table_hdr, cpu_node)) {
return (struct acpi_pptt_processor *)entry;
}
diff --git a/drivers/android/binderfs.c b/drivers/android/binderfs.c
index 94c6446604fc..98da8c4eea59 100644
--- a/drivers/android/binderfs.c
+++ b/drivers/android/binderfs.c
@@ -187,7 +187,7 @@ static int binderfs_binder_device_create(struct inode *ref_inode,
inode_lock(d_inode(root));
/* look it up */
- dentry = lookup_one_len(name, root, name_len);
+ dentry = lookup_noperm(&QSTR(name), root);
if (IS_ERR(dentry)) {
inode_unlock(d_inode(root));
ret = PTR_ERR(dentry);
@@ -487,7 +487,7 @@ static struct dentry *binderfs_create_dentry(struct dentry *parent,
{
struct dentry *dentry;
- dentry = lookup_one_len(name, parent, strlen(name));
+ dentry = lookup_noperm(&QSTR(name), parent);
if (IS_ERR(dentry))
return dentry;
diff --git a/drivers/base/node.c b/drivers/base/node.c
index cd13ef287011..618712071a1e 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -468,7 +468,7 @@ static ssize_t node_read_meminfo(struct device *dev,
nid, K(node_page_state(pgdat, NR_PAGETABLE)),
nid, K(node_page_state(pgdat, NR_SECONDARY_PAGETABLE)),
nid, 0UL,
- nid, K(sum_zone_node_page_state(nid, NR_BOUNCE)),
+ nid, 0UL,
nid, K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
nid, K(sreclaimable +
node_page_state(pgdat, NR_KERNEL_MISC_RECLAIMABLE)),
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index e48b24be45ee..0f70e2374e7f 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -407,4 +407,23 @@ config BLKDEV_UBLK_LEGACY_OPCODES
source "drivers/block/rnbd/Kconfig"
+config BLK_DEV_ZONED_LOOP
+ tristate "Zoned loopback device support"
+ depends on BLK_DEV_ZONED
+ help
+ Saying Y here will allow you to use create a zoned block device using
+ regular files for zones (one file per zones). This is useful to test
+ file systems, device mapper and applications that support zoned block
+ devices. To create a zoned loop device, no user utility is needed, a
+ zoned loop device can be created (or re-started) using a command
+ like:
+
+ echo "add id=0,zone_size_mb=256,capacity_mb=16384,conv_zones=11" > \
+ /dev/zloop-control
+
+ See Documentation/admin-guide/blockdev/zoned_loop.rst for usage
+ details.
+
+ If unsure, say N.
+
endif # BLK_DEV
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index 1105a2d4fdcb..097707aca725 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -41,5 +41,6 @@ obj-$(CONFIG_BLK_DEV_RNBD) += rnbd/
obj-$(CONFIG_BLK_DEV_NULL_BLK) += null_blk/
obj-$(CONFIG_BLK_DEV_UBLK) += ublk_drv.o
+obj-$(CONFIG_BLK_DEV_ZONED_LOOP) += zloop.o
swim_mod-y := swim.o swim_asm.o
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 292f127cae0a..b1be6c510372 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -54,32 +54,33 @@ static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
/*
* Insert a new page for a given sector, if one does not already exist.
*/
-static int brd_insert_page(struct brd_device *brd, sector_t sector, gfp_t gfp)
+static struct page *brd_insert_page(struct brd_device *brd, sector_t sector,
+ blk_opf_t opf)
+ __releases(rcu)
+ __acquires(rcu)
{
- pgoff_t idx = sector >> PAGE_SECTORS_SHIFT;
- struct page *page;
- int ret = 0;
-
- page = brd_lookup_page(brd, sector);
- if (page)
- return 0;
+ gfp_t gfp = (opf & REQ_NOWAIT) ? GFP_NOWAIT : GFP_NOIO;
+ struct page *page, *ret;
+ rcu_read_unlock();
page = alloc_page(gfp | __GFP_ZERO | __GFP_HIGHMEM);
+ rcu_read_lock();
if (!page)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
xa_lock(&brd->brd_pages);
- ret = __xa_insert(&brd->brd_pages, idx, page, gfp);
- if (!ret)
- brd->brd_nr_pages++;
- xa_unlock(&brd->brd_pages);
-
- if (ret < 0) {
+ ret = __xa_cmpxchg(&brd->brd_pages, sector >> PAGE_SECTORS_SHIFT, NULL,
+ page, gfp);
+ if (ret) {
+ xa_unlock(&brd->brd_pages);
__free_page(page);
- if (ret == -EBUSY)
- ret = 0;
+ if (xa_is_err(ret))
+ return ERR_PTR(xa_err(ret));
+ return ret;
}
- return ret;
+ brd->brd_nr_pages++;
+ xa_unlock(&brd->brd_pages);
+ return page;
}
/*
@@ -100,143 +101,77 @@ static void brd_free_pages(struct brd_device *brd)
}
/*
- * copy_to_brd_setup must be called before copy_to_brd. It may sleep.
+ * Process a single segment. The segment is capped to not cross page boundaries
+ * in both the bio and the brd backing memory.
*/
-static int copy_to_brd_setup(struct brd_device *brd, sector_t sector, size_t n,
- gfp_t gfp)
-{
- unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
- size_t copy;
- int ret;
-
- copy = min_t(size_t, n, PAGE_SIZE - offset);
- ret = brd_insert_page(brd, sector, gfp);
- if (ret)
- return ret;
- if (copy < n) {
- sector += copy >> SECTOR_SHIFT;
- ret = brd_insert_page(brd, sector, gfp);
- }
- return ret;
-}
-
-/*
- * Copy n bytes from src to the brd starting at sector. Does not sleep.
- */
-static void copy_to_brd(struct brd_device *brd, const void *src,
- sector_t sector, size_t n)
+static bool brd_rw_bvec(struct brd_device *brd, struct bio *bio)
{
+ struct bio_vec bv = bio_iter_iovec(bio, bio->bi_iter);
+ sector_t sector = bio->bi_iter.bi_sector;
+ u32 offset = (sector & (PAGE_SECTORS - 1)) << SECTOR_SHIFT;
+ blk_opf_t opf = bio->bi_opf;
struct page *page;
- void *dst;
- unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
- size_t copy;
+ void *kaddr;
- copy = min_t(size_t, n, PAGE_SIZE - offset);
- page = brd_lookup_page(brd, sector);
- BUG_ON(!page);
-
- dst = kmap_atomic(page);
- memcpy(dst + offset, src, copy);
- kunmap_atomic(dst);
-
- if (copy < n) {
- src += copy;
- sector += copy >> SECTOR_SHIFT;
- copy = n - copy;
- page = brd_lookup_page(brd, sector);
- BUG_ON(!page);
-
- dst = kmap_atomic(page);
- memcpy(dst, src, copy);
- kunmap_atomic(dst);
- }
-}
+ bv.bv_len = min_t(u32, bv.bv_len, PAGE_SIZE - offset);
-/*
- * Copy n bytes to dst from the brd starting at sector. Does not sleep.
- */
-static void copy_from_brd(void *dst, struct brd_device *brd,
- sector_t sector, size_t n)
-{
- struct page *page;
- void *src;
- unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
- size_t copy;
-
- copy = min_t(size_t, n, PAGE_SIZE - offset);
+ rcu_read_lock();
page = brd_lookup_page(brd, sector);
- if (page) {
- src = kmap_atomic(page);
- memcpy(dst, src + offset, copy);
- kunmap_atomic(src);
- } else
- memset(dst, 0, copy);
-
- if (copy < n) {
- dst += copy;
- sector += copy >> SECTOR_SHIFT;
- copy = n - copy;
- page = brd_lookup_page(brd, sector);
- if (page) {
- src = kmap_atomic(page);
- memcpy(dst, src, copy);
- kunmap_atomic(src);
- } else
- memset(dst, 0, copy);
+ if (!page && op_is_write(opf)) {
+ page = brd_insert_page(brd, sector, opf);
+ if (IS_ERR(page))
+ goto out_error;
}
-}
-
-/*
- * Process a single bvec of a bio.
- */
-static int brd_do_bvec(struct brd_device *brd, struct page *page,
- unsigned int len, unsigned int off, blk_opf_t opf,
- sector_t sector)
-{
- void *mem;
- int err = 0;
+ kaddr = bvec_kmap_local(&bv);
if (op_is_write(opf)) {
- /*
- * Must use NOIO because we don't want to recurse back into the
- * block or filesystem layers from page reclaim.
- */
- gfp_t gfp = opf & REQ_NOWAIT ? GFP_NOWAIT : GFP_NOIO;
-
- err = copy_to_brd_setup(brd, sector, len, gfp);
- if (err)
- goto out;
- }
-
- mem = kmap_atomic(page);
- if (!op_is_write(opf)) {
- copy_from_brd(mem + off, brd, sector, len);
- flush_dcache_page(page);
+ memcpy_to_page(page, offset, kaddr, bv.bv_len);
} else {
- flush_dcache_page(page);
- copy_to_brd(brd, mem + off, sector, len);
+ if (page)
+ memcpy_from_page(kaddr, page, offset, bv.bv_len);
+ else
+ memset(kaddr, 0, bv.bv_len);
}
- kunmap_atomic(mem);
+ kunmap_local(kaddr);
+ rcu_read_unlock();
+
+ bio_advance_iter_single(bio, &bio->bi_iter, bv.bv_len);
+ return true;
+
+out_error:
+ rcu_read_unlock();
+ if (PTR_ERR(page) == -ENOMEM && (opf & REQ_NOWAIT))
+ bio_wouldblock_error(bio);
+ else
+ bio_io_error(bio);
+ return false;
+}
-out:
- return err;
+static void brd_free_one_page(struct rcu_head *head)
+{
+ struct page *page = container_of(head, struct page, rcu_head);
+
+ __free_page(page);
}
static void brd_do_discard(struct brd_device *brd, sector_t sector, u32 size)
{
- sector_t aligned_sector = (sector + PAGE_SECTORS) & ~PAGE_SECTORS;
+ sector_t aligned_sector = round_up(sector, PAGE_SECTORS);
+ sector_t aligned_end = round_down(
+ sector + (size >> SECTOR_SHIFT), PAGE_SECTORS);
struct page *page;
- size -= (aligned_sector - sector) * SECTOR_SIZE;
+ if (aligned_end <= aligned_sector)
+ return;
+
xa_lock(&brd->brd_pages);
- while (size >= PAGE_SIZE && aligned_sector < rd_size * 2) {
+ while (aligned_sector < aligned_end && aligned_sector < rd_size * 2) {
page = __xa_erase(&brd->brd_pages, aligned_sector >> PAGE_SECTORS_SHIFT);
if (page) {
- __free_page(page);
+ call_rcu(&page->rcu_head, brd_free_one_page);
brd->brd_nr_pages--;
}
aligned_sector += PAGE_SECTORS;
- size -= PAGE_SIZE;
}
xa_unlock(&brd->brd_pages);
}
@@ -244,36 +179,18 @@ static void brd_do_discard(struct brd_device *brd, sector_t sector, u32 size)
static void brd_submit_bio(struct bio *bio)
{
struct brd_device *brd = bio->bi_bdev->bd_disk->private_data;
- sector_t sector = bio->bi_iter.bi_sector;
- struct bio_vec bvec;
- struct bvec_iter iter;
if (unlikely(op_is_discard(bio->bi_opf))) {
- brd_do_discard(brd, sector, bio->bi_iter.bi_size);
+ brd_do_discard(brd, bio->bi_iter.bi_sector,
+ bio->bi_iter.bi_size);
bio_endio(bio);
return;
}
- bio_for_each_segment(bvec, bio, iter) {
- unsigned int len = bvec.bv_len;
- int err;
-
- /* Don't support un-aligned buffer */
- WARN_ON_ONCE((bvec.bv_offset & (SECTOR_SIZE - 1)) ||
- (len & (SECTOR_SIZE - 1)));
-
- err = brd_do_bvec(brd, bvec.bv_page, len, bvec.bv_offset,
- bio->bi_opf, sector);
- if (err) {
- if (err == -ENOMEM && bio->bi_opf & REQ_NOWAIT) {
- bio_wouldblock_error(bio);
- return;
- }
- bio_io_error(bio);
+ do {
+ if (!brd_rw_bvec(brd, bio))
return;
- }
- sector += len >> SECTOR_SHIFT;
- }
+ } while (bio->bi_iter.bi_size);
bio_endio(bio);
}
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index b8ba7de08753..e2b1f377f585 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -979,9 +979,6 @@ static int loop_configure(struct loop_device *lo, blk_mode_t mode,
if (!file)
return -EBADF;
- if ((mode & BLK_OPEN_WRITE) && !file->f_op->write_iter)
- return -EINVAL;
-
error = loop_check_backing_file(file);
if (error)
return error;
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 65b96c083b3c..d5cc7bd2875c 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -725,7 +725,7 @@ static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command *
scmd = blk_mq_rq_to_pdu(rq);
if (cgc->buflen) {
- ret = blk_rq_map_kern(q, rq, cgc->buffer, cgc->buflen,
+ ret = blk_rq_map_kern(rq, cgc->buffer, cgc->buflen,
GFP_NOIO);
if (ret)
goto out;
diff --git a/drivers/block/rnbd/rnbd-srv.c b/drivers/block/rnbd/rnbd-srv.c
index 2ee6e9bd4e28..2df8941a6b14 100644
--- a/drivers/block/rnbd/rnbd-srv.c
+++ b/drivers/block/rnbd/rnbd-srv.c
@@ -147,12 +147,7 @@ static int process_rdma(struct rnbd_srv_session *srv_sess,
bio = bio_alloc(file_bdev(sess_dev->bdev_file), 1,
rnbd_to_bio_flags(le32_to_cpu(msg->rw)), GFP_KERNEL);
- if (bio_add_page(bio, virt_to_page(data), datalen,
- offset_in_page(data)) != datalen) {
- rnbd_srv_err_rl(sess_dev, "Failed to map data to bio\n");
- err = -EINVAL;
- goto bio_put;
- }
+ bio_add_virt_nofail(bio, data, datalen);
bio->bi_opf = rnbd_to_bio_flags(le32_to_cpu(msg->rw));
if (bio_has_data(bio) &&
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index f9032076bc06..6f51072776f1 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -50,6 +50,8 @@
/* private ioctl command mirror */
#define UBLK_CMD_DEL_DEV_ASYNC _IOC_NR(UBLK_U_CMD_DEL_DEV_ASYNC)
+#define UBLK_CMD_UPDATE_SIZE _IOC_NR(UBLK_U_CMD_UPDATE_SIZE)
+#define UBLK_CMD_QUIESCE_DEV _IOC_NR(UBLK_U_CMD_QUIESCE_DEV)
#define UBLK_IO_REGISTER_IO_BUF _IOC_NR(UBLK_U_IO_REGISTER_IO_BUF)
#define UBLK_IO_UNREGISTER_IO_BUF _IOC_NR(UBLK_U_IO_UNREGISTER_IO_BUF)
@@ -64,7 +66,10 @@
| UBLK_F_CMD_IOCTL_ENCODE \
| UBLK_F_USER_COPY \
| UBLK_F_ZONED \
- | UBLK_F_USER_RECOVERY_FAIL_IO)
+ | UBLK_F_USER_RECOVERY_FAIL_IO \
+ | UBLK_F_UPDATE_SIZE \
+ | UBLK_F_AUTO_BUF_REG \
+ | UBLK_F_QUIESCE)
#define UBLK_F_ALL_RECOVERY_FLAGS (UBLK_F_USER_RECOVERY \
| UBLK_F_USER_RECOVERY_REISSUE \
@@ -77,7 +82,11 @@
UBLK_PARAM_TYPE_DMA_ALIGN | UBLK_PARAM_TYPE_SEGMENT)
struct ublk_rq_data {
- struct kref ref;
+ refcount_t ref;
+
+ /* for auto-unregister buffer in case of UBLK_F_AUTO_BUF_REG */
+ u16 buf_index;
+ void *buf_ctx_handle;
};
struct ublk_uring_cmd_pdu {
@@ -99,6 +108,9 @@ struct ublk_uring_cmd_pdu {
* setup in ublk uring_cmd handler
*/
struct ublk_queue *ubq;
+
+ struct ublk_auto_buf_reg buf;
+
u16 tag;
};
@@ -131,6 +143,14 @@ struct ublk_uring_cmd_pdu {
*/
#define UBLK_IO_FLAG_NEED_GET_DATA 0x08
+/*
+ * request buffer is registered automatically, so we have to unregister it
+ * before completing this request.
+ *
+ * io_uring will unregister buffer automatically for us during exiting.
+ */
+#define UBLK_IO_FLAG_AUTO_BUF_REG 0x10
+
/* atomic RW with ubq->cancel_lock */
#define UBLK_IO_FLAG_CANCELED 0x80000000
@@ -140,7 +160,12 @@ struct ublk_io {
unsigned int flags;
int res;
- struct io_uring_cmd *cmd;
+ union {
+ /* valid if UBLK_IO_FLAG_ACTIVE is set */
+ struct io_uring_cmd *cmd;
+ /* valid if UBLK_IO_FLAG_OWNED_BY_SRV is set */
+ struct request *req;
+ };
};
struct ublk_queue {
@@ -198,13 +223,19 @@ struct ublk_params_header {
__u32 types;
};
+static void ublk_io_release(void *priv);
static void ublk_stop_dev_unlocked(struct ublk_device *ub);
static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq);
static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
const struct ublk_queue *ubq, int tag, size_t offset);
static inline unsigned int ublk_req_build_flags(struct request *req);
-static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq,
- int tag);
+
+static inline struct ublksrv_io_desc *
+ublk_get_iod(const struct ublk_queue *ubq, unsigned tag)
+{
+ return &ubq->io_cmd_buf[tag];
+}
+
static inline bool ublk_dev_is_zoned(const struct ublk_device *ub)
{
return ub->dev_info.flags & UBLK_F_ZONED;
@@ -356,8 +387,7 @@ static int ublk_report_zones(struct gendisk *disk, sector_t sector,
if (ret)
goto free_req;
- ret = blk_rq_map_kern(disk->queue, req, buffer, buffer_length,
- GFP_KERNEL);
+ ret = blk_rq_map_kern(req, buffer, buffer_length, GFP_KERNEL);
if (ret)
goto erase_desc;
@@ -477,7 +507,6 @@ static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
#endif
static inline void __ublk_complete_rq(struct request *req);
-static void ublk_complete_rq(struct kref *ref);
static dev_t ublk_chr_devt;
static const struct class ublk_chr_class = {
@@ -609,6 +638,11 @@ static inline bool ublk_support_zero_copy(const struct ublk_queue *ubq)
return ubq->flags & UBLK_F_SUPPORT_ZERO_COPY;
}
+static inline bool ublk_support_auto_buf_reg(const struct ublk_queue *ubq)
+{
+ return ubq->flags & UBLK_F_AUTO_BUF_REG;
+}
+
static inline bool ublk_support_user_copy(const struct ublk_queue *ubq)
{
return ubq->flags & UBLK_F_USER_COPY;
@@ -616,7 +650,8 @@ static inline bool ublk_support_user_copy(const struct ublk_queue *ubq)
static inline bool ublk_need_map_io(const struct ublk_queue *ubq)
{
- return !ublk_support_user_copy(ubq) && !ublk_support_zero_copy(ubq);
+ return !ublk_support_user_copy(ubq) && !ublk_support_zero_copy(ubq) &&
+ !ublk_support_auto_buf_reg(ubq);
}
static inline bool ublk_need_req_ref(const struct ublk_queue *ubq)
@@ -627,8 +662,13 @@ static inline bool ublk_need_req_ref(const struct ublk_queue *ubq)
*
* for zero copy, request buffer need to be registered to io_uring
* buffer table, so reference is needed
+ *
+ * For auto buffer register, ublk server still may issue
+ * UBLK_IO_COMMIT_AND_FETCH_REQ before one registered buffer is used up,
+ * so reference is required too.
*/
- return ublk_support_user_copy(ubq) || ublk_support_zero_copy(ubq);
+ return ublk_support_user_copy(ubq) || ublk_support_zero_copy(ubq) ||
+ ublk_support_auto_buf_reg(ubq);
}
static inline void ublk_init_req_ref(const struct ublk_queue *ubq,
@@ -637,7 +677,7 @@ static inline void ublk_init_req_ref(const struct ublk_queue *ubq,
if (ublk_need_req_ref(ubq)) {
struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
- kref_init(&data->ref);
+ refcount_set(&data->ref, 1);
}
}
@@ -647,7 +687,7 @@ static inline bool ublk_get_req_ref(const struct ublk_queue *ubq,
if (ublk_need_req_ref(ubq)) {
struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
- return kref_get_unless_zero(&data->ref);
+ return refcount_inc_not_zero(&data->ref);
}
return true;
@@ -659,7 +699,8 @@ static inline void ublk_put_req_ref(const struct ublk_queue *ubq,
if (ublk_need_req_ref(ubq)) {
struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
- kref_put(&data->ref, ublk_complete_rq);
+ if (refcount_dec_and_test(&data->ref))
+ __ublk_complete_rq(req);
} else {
__ublk_complete_rq(req);
}
@@ -695,12 +736,6 @@ static inline bool ublk_rq_has_data(const struct request *rq)
return bio_has_data(rq->bio);
}
-static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq,
- int tag)
-{
- return &ubq->io_cmd_buf[tag];
-}
-
static inline struct ublksrv_io_desc *
ublk_queue_cmd_buf(struct ublk_device *ub, int q_id)
{
@@ -1117,18 +1152,12 @@ exit:
blk_mq_end_request(req, res);
}
-static void ublk_complete_rq(struct kref *ref)
+static void ublk_complete_io_cmd(struct ublk_io *io, struct request *req,
+ int res, unsigned issue_flags)
{
- struct ublk_rq_data *data = container_of(ref, struct ublk_rq_data,
- ref);
- struct request *req = blk_mq_rq_from_pdu(data);
+ /* read cmd first because req will overwrite it */
+ struct io_uring_cmd *cmd = io->cmd;
- __ublk_complete_rq(req);
-}
-
-static void ubq_complete_io_cmd(struct ublk_io *io, int res,
- unsigned issue_flags)
-{
/* mark this cmd owned by ublksrv */
io->flags |= UBLK_IO_FLAG_OWNED_BY_SRV;
@@ -1138,8 +1167,10 @@ static void ubq_complete_io_cmd(struct ublk_io *io, int res,
*/
io->flags &= ~UBLK_IO_FLAG_ACTIVE;
+ io->req = req;
+
/* tell ublksrv one io request is coming */
- io_uring_cmd_done(io->cmd, res, 0, issue_flags);
+ io_uring_cmd_done(cmd, res, 0, issue_flags);
}
#define UBLK_REQUEUE_DELAY_MS 3
@@ -1154,16 +1185,91 @@ static inline void __ublk_abort_rq(struct ublk_queue *ubq,
blk_mq_end_request(rq, BLK_STS_IOERR);
}
+static void ublk_auto_buf_reg_fallback(struct request *req)
+{
+ const struct ublk_queue *ubq = req->mq_hctx->driver_data;
+ struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
+ struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
+
+ iod->op_flags |= UBLK_IO_F_NEED_REG_BUF;
+ refcount_set(&data->ref, 1);
+}
+
+static bool ublk_auto_buf_reg(struct request *req, struct ublk_io *io,
+ unsigned int issue_flags)
+{
+ struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(io->cmd);
+ struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
+ int ret;
+
+ ret = io_buffer_register_bvec(io->cmd, req, ublk_io_release,
+ pdu->buf.index, issue_flags);
+ if (ret) {
+ if (pdu->buf.flags & UBLK_AUTO_BUF_REG_FALLBACK) {
+ ublk_auto_buf_reg_fallback(req);
+ return true;
+ }
+ blk_mq_end_request(req, BLK_STS_IOERR);
+ return false;
+ }
+ /* one extra reference is dropped by ublk_io_release */
+ refcount_set(&data->ref, 2);
+
+ data->buf_ctx_handle = io_uring_cmd_ctx_handle(io->cmd);
+ /* store buffer index in request payload */
+ data->buf_index = pdu->buf.index;
+ io->flags |= UBLK_IO_FLAG_AUTO_BUF_REG;
+ return true;
+}
+
+static bool ublk_prep_auto_buf_reg(struct ublk_queue *ubq,
+ struct request *req, struct ublk_io *io,
+ unsigned int issue_flags)
+{
+ if (ublk_support_auto_buf_reg(ubq) && ublk_rq_has_data(req))
+ return ublk_auto_buf_reg(req, io, issue_flags);
+
+ ublk_init_req_ref(ubq, req);
+ return true;
+}
+
+static bool ublk_start_io(const struct ublk_queue *ubq, struct request *req,
+ struct ublk_io *io)
+{
+ unsigned mapped_bytes = ublk_map_io(ubq, req, io);
+
+ /* partially mapped, update io descriptor */
+ if (unlikely(mapped_bytes != blk_rq_bytes(req))) {
+ /*
+ * Nothing mapped, retry until we succeed.
+ *
+ * We may never succeed in mapping any bytes here because
+ * of OOM. TODO: reserve one buffer with single page pinned
+ * for providing forward progress guarantee.
+ */
+ if (unlikely(!mapped_bytes)) {
+ blk_mq_requeue_request(req, false);
+ blk_mq_delay_kick_requeue_list(req->q,
+ UBLK_REQUEUE_DELAY_MS);
+ return false;
+ }
+
+ ublk_get_iod(ubq, req->tag)->nr_sectors =
+ mapped_bytes >> 9;
+ }
+
+ return true;
+}
+
static void ublk_dispatch_req(struct ublk_queue *ubq,
struct request *req,
unsigned int issue_flags)
{
int tag = req->tag;
struct ublk_io *io = &ubq->ios[tag];
- unsigned int mapped_bytes;
- pr_devel("%s: complete: op %d, qid %d tag %d io_flags %x addr %llx\n",
- __func__, io->cmd->cmd_op, ubq->q_id, req->tag, io->flags,
+ pr_devel("%s: complete: qid %d tag %d io_flags %x addr %llx\n",
+ __func__, ubq->q_id, req->tag, io->flags,
ublk_get_iod(ubq, req->tag)->addr);
/*
@@ -1183,54 +1289,22 @@ static void ublk_dispatch_req(struct ublk_queue *ubq,
if (ublk_need_get_data(ubq) && ublk_need_map_req(req)) {
/*
* We have not handled UBLK_IO_NEED_GET_DATA command yet,
- * so immepdately pass UBLK_IO_RES_NEED_GET_DATA to ublksrv
+ * so immediately pass UBLK_IO_RES_NEED_GET_DATA to ublksrv
* and notify it.
*/
- if (!(io->flags & UBLK_IO_FLAG_NEED_GET_DATA)) {
- io->flags |= UBLK_IO_FLAG_NEED_GET_DATA;
- pr_devel("%s: need get data. op %d, qid %d tag %d io_flags %x\n",
- __func__, io->cmd->cmd_op, ubq->q_id,
- req->tag, io->flags);
- ubq_complete_io_cmd(io, UBLK_IO_RES_NEED_GET_DATA, issue_flags);
- return;
- }
- /*
- * We have handled UBLK_IO_NEED_GET_DATA command,
- * so clear UBLK_IO_FLAG_NEED_GET_DATA now and just
- * do the copy work.
- */
- io->flags &= ~UBLK_IO_FLAG_NEED_GET_DATA;
- /* update iod->addr because ublksrv may have passed a new io buffer */
- ublk_get_iod(ubq, req->tag)->addr = io->addr;
- pr_devel("%s: update iod->addr: op %d, qid %d tag %d io_flags %x addr %llx\n",
- __func__, io->cmd->cmd_op, ubq->q_id, req->tag, io->flags,
- ublk_get_iod(ubq, req->tag)->addr);
+ io->flags |= UBLK_IO_FLAG_NEED_GET_DATA;
+ pr_devel("%s: need get data. qid %d tag %d io_flags %x\n",
+ __func__, ubq->q_id, req->tag, io->flags);
+ ublk_complete_io_cmd(io, req, UBLK_IO_RES_NEED_GET_DATA,
+ issue_flags);
+ return;
}
- mapped_bytes = ublk_map_io(ubq, req, io);
-
- /* partially mapped, update io descriptor */
- if (unlikely(mapped_bytes != blk_rq_bytes(req))) {
- /*
- * Nothing mapped, retry until we succeed.
- *
- * We may never succeed in mapping any bytes here because
- * of OOM. TODO: reserve one buffer with single page pinned
- * for providing forward progress guarantee.
- */
- if (unlikely(!mapped_bytes)) {
- blk_mq_requeue_request(req, false);
- blk_mq_delay_kick_requeue_list(req->q,
- UBLK_REQUEUE_DELAY_MS);
- return;
- }
-
- ublk_get_iod(ubq, req->tag)->nr_sectors =
- mapped_bytes >> 9;
- }
+ if (!ublk_start_io(ubq, req, io))
+ return;
- ublk_init_req_ref(ubq, req);
- ubq_complete_io_cmd(io, UBLK_IO_RES_OK, issue_flags);
+ if (ublk_prep_auto_buf_reg(ubq, req, io, issue_flags))
+ ublk_complete_io_cmd(io, req, UBLK_IO_RES_OK, issue_flags);
}
static void ublk_cmd_tw_cb(struct io_uring_cmd *cmd,
@@ -1590,30 +1664,6 @@ static int ublk_ch_mmap(struct file *filp, struct vm_area_struct *vma)
return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
}
-static void ublk_commit_completion(struct ublk_device *ub,
- const struct ublksrv_io_cmd *ub_cmd)
-{
- u32 qid = ub_cmd->q_id, tag = ub_cmd->tag;
- struct ublk_queue *ubq = ublk_get_queue(ub, qid);
- struct ublk_io *io = &ubq->ios[tag];
- struct request *req;
-
- /* now this cmd slot is owned by nbd driver */
- io->flags &= ~UBLK_IO_FLAG_OWNED_BY_SRV;
- io->res = ub_cmd->result;
-
- /* find the io request and complete */
- req = blk_mq_tag_to_rq(ub->tag_set.tags[qid], tag);
- if (WARN_ON_ONCE(unlikely(!req)))
- return;
-
- if (req_op(req) == REQ_OP_ZONE_APPEND)
- req->__sector = ub_cmd->zone_append_lba;
-
- if (likely(!blk_should_fake_timeout(req->q)))
- ublk_put_req_ref(ubq, req);
-}
-
static void __ublk_fail_req(struct ublk_queue *ubq, struct ublk_io *io,
struct request *req)
{
@@ -1642,17 +1692,8 @@ static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq)
for (i = 0; i < ubq->q_depth; i++) {
struct ublk_io *io = &ubq->ios[i];
- if (!(io->flags & UBLK_IO_FLAG_ACTIVE)) {
- struct request *rq;
-
- /*
- * Either we fail the request or ublk_rq_task_work_cb
- * will do it
- */
- rq = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], i);
- if (rq && blk_mq_request_started(rq))
- __ublk_fail_req(ubq, io, rq);
- }
+ if (io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)
+ __ublk_fail_req(ubq, io, io->req);
}
}
@@ -1708,7 +1749,7 @@ static void ublk_cancel_cmd(struct ublk_queue *ubq, unsigned tag,
* that ublk_dispatch_req() is always called
*/
req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag);
- if (req && blk_mq_request_started(req))
+ if (req && blk_mq_request_started(req) && req->tag == tag)
return;
spin_lock(&ubq->cancel_lock);
@@ -1940,6 +1981,20 @@ static inline void ublk_prep_cancel(struct io_uring_cmd *cmd,
io_uring_cmd_mark_cancelable(cmd, issue_flags);
}
+static inline int ublk_set_auto_buf_reg(struct io_uring_cmd *cmd)
+{
+ struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
+
+ pdu->buf = ublk_sqe_addr_to_auto_buf_reg(READ_ONCE(cmd->sqe->addr));
+
+ if (pdu->buf.reserved0 || pdu->buf.reserved1)
+ return -EINVAL;
+
+ if (pdu->buf.flags & ~UBLK_AUTO_BUF_REG_F_MASK)
+ return -EINVAL;
+ return 0;
+}
+
static void ublk_io_release(void *priv)
{
struct request *rq = priv;
@@ -1953,16 +2008,12 @@ static int ublk_register_io_buf(struct io_uring_cmd *cmd,
unsigned int index, unsigned int issue_flags)
{
struct ublk_device *ub = cmd->file->private_data;
- const struct ublk_io *io = &ubq->ios[tag];
struct request *req;
int ret;
if (!ublk_support_zero_copy(ubq))
return -EINVAL;
- if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
- return -EINVAL;
-
req = __ublk_check_and_get_req(ub, ubq, tag, 0);
if (!req)
return -EINVAL;
@@ -1978,17 +2029,12 @@ static int ublk_register_io_buf(struct io_uring_cmd *cmd,
}
static int ublk_unregister_io_buf(struct io_uring_cmd *cmd,
- const struct ublk_queue *ubq, unsigned int tag,
+ const struct ublk_queue *ubq,
unsigned int index, unsigned int issue_flags)
{
- const struct ublk_io *io = &ubq->ios[tag];
-
if (!ublk_support_zero_copy(ubq))
return -EINVAL;
- if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
- return -EINVAL;
-
return io_buffer_unregister_bvec(cmd, index, issue_flags);
}
@@ -2031,6 +2077,12 @@ static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_queue *ubq,
goto out;
}
+ if (ublk_support_auto_buf_reg(ubq)) {
+ ret = ublk_set_auto_buf_reg(cmd);
+ if (ret)
+ goto out;
+ }
+
ublk_fill_io_cmd(io, cmd, buf_addr);
ublk_mark_io_ready(ub, ubq);
out:
@@ -2038,6 +2090,90 @@ out:
return ret;
}
+static int ublk_commit_and_fetch(const struct ublk_queue *ubq,
+ struct ublk_io *io, struct io_uring_cmd *cmd,
+ const struct ublksrv_io_cmd *ub_cmd,
+ unsigned int issue_flags)
+{
+ struct request *req = io->req;
+
+ if (ublk_need_map_io(ubq)) {
+ /*
+ * COMMIT_AND_FETCH_REQ has to provide IO buffer if
+ * NEED GET DATA is not enabled or it is Read IO.
+ */
+ if (!ub_cmd->addr && (!ublk_need_get_data(ubq) ||
+ req_op(req) == REQ_OP_READ))
+ return -EINVAL;
+ } else if (req_op(req) != REQ_OP_ZONE_APPEND && ub_cmd->addr) {
+ /*
+ * User copy requires addr to be unset when command is
+ * not zone append
+ */
+ return -EINVAL;
+ }
+
+ if (ublk_support_auto_buf_reg(ubq)) {
+ int ret;
+
+ /*
+ * `UBLK_F_AUTO_BUF_REG` only works iff `UBLK_IO_FETCH_REQ`
+ * and `UBLK_IO_COMMIT_AND_FETCH_REQ` are issued from same
+ * `io_ring_ctx`.
+ *
+ * If this uring_cmd's io_ring_ctx isn't same with the
+ * one for registering the buffer, it is ublk server's
+ * responsibility for unregistering the buffer, otherwise
+ * this ublk request gets stuck.
+ */
+ if (io->flags & UBLK_IO_FLAG_AUTO_BUF_REG) {
+ struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
+
+ if (data->buf_ctx_handle == io_uring_cmd_ctx_handle(cmd))
+ io_buffer_unregister_bvec(cmd, data->buf_index,
+ issue_flags);
+ io->flags &= ~UBLK_IO_FLAG_AUTO_BUF_REG;
+ }
+
+ ret = ublk_set_auto_buf_reg(cmd);
+ if (ret)
+ return ret;
+ }
+
+ ublk_fill_io_cmd(io, cmd, ub_cmd->addr);
+
+ /* now this cmd slot is owned by ublk driver */
+ io->flags &= ~UBLK_IO_FLAG_OWNED_BY_SRV;
+ io->res = ub_cmd->result;
+
+ if (req_op(req) == REQ_OP_ZONE_APPEND)
+ req->__sector = ub_cmd->zone_append_lba;
+
+ if (likely(!blk_should_fake_timeout(req->q)))
+ ublk_put_req_ref(ubq, req);
+
+ return 0;
+}
+
+static bool ublk_get_data(const struct ublk_queue *ubq, struct ublk_io *io)
+{
+ struct request *req = io->req;
+
+ /*
+ * We have handled UBLK_IO_NEED_GET_DATA command,
+ * so clear UBLK_IO_FLAG_NEED_GET_DATA now and just
+ * do the copy work.
+ */
+ io->flags &= ~UBLK_IO_FLAG_NEED_GET_DATA;
+ /* update iod->addr because ublksrv may have passed a new io buffer */
+ ublk_get_iod(ubq, req->tag)->addr = io->addr;
+ pr_devel("%s: update iod->addr: qid %d tag %d io_flags %x addr %llx\n",
+ __func__, ubq->q_id, req->tag, io->flags,
+ ublk_get_iod(ubq, req->tag)->addr);
+
+ return ublk_start_io(ubq, req, io);
+}
+
static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
unsigned int issue_flags,
const struct ublksrv_io_cmd *ub_cmd)
@@ -2048,7 +2184,6 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
u32 cmd_op = cmd->cmd_op;
unsigned tag = ub_cmd->tag;
int ret = -EINVAL;
- struct request *req;
pr_devel("%s: received: cmd op %d queue %d tag %d result %d\n",
__func__, cmd->cmd_op, ub_cmd->q_id, tag,
@@ -2058,9 +2193,6 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
goto out;
ubq = ublk_get_queue(ub, ub_cmd->q_id);
- if (!ubq || ub_cmd->q_id != ubq->q_id)
- goto out;
-
if (ubq->ubq_daemon && ubq->ubq_daemon != current)
goto out;
@@ -2075,6 +2207,11 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
goto out;
}
+ /* only UBLK_IO_FETCH_REQ is allowed if io is not OWNED_BY_SRV */
+ if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV) &&
+ _IOC_NR(cmd_op) != UBLK_IO_FETCH_REQ)
+ goto out;
+
/*
* ensure that the user issues UBLK_IO_NEED_GET_DATA
* iff the driver have set the UBLK_IO_FLAG_NEED_GET_DATA.
@@ -2092,45 +2229,23 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
case UBLK_IO_REGISTER_IO_BUF:
return ublk_register_io_buf(cmd, ubq, tag, ub_cmd->addr, issue_flags);
case UBLK_IO_UNREGISTER_IO_BUF:
- return ublk_unregister_io_buf(cmd, ubq, tag, ub_cmd->addr, issue_flags);
+ return ublk_unregister_io_buf(cmd, ubq, ub_cmd->addr, issue_flags);
case UBLK_IO_FETCH_REQ:
ret = ublk_fetch(cmd, ubq, io, ub_cmd->addr);
if (ret)
goto out;
break;
case UBLK_IO_COMMIT_AND_FETCH_REQ:
- req = blk_mq_tag_to_rq(ub->tag_set.tags[ub_cmd->q_id], tag);
-
- if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
- goto out;
-
- if (ublk_need_map_io(ubq)) {
- /*
- * COMMIT_AND_FETCH_REQ has to provide IO buffer if
- * NEED GET DATA is not enabled or it is Read IO.
- */
- if (!ub_cmd->addr && (!ublk_need_get_data(ubq) ||
- req_op(req) == REQ_OP_READ))
- goto out;
- } else if (req_op(req) != REQ_OP_ZONE_APPEND && ub_cmd->addr) {
- /*
- * User copy requires addr to be unset when command is
- * not zone append
- */
- ret = -EINVAL;
+ ret = ublk_commit_and_fetch(ubq, io, cmd, ub_cmd, issue_flags);
+ if (ret)
goto out;
- }
-
- ublk_fill_io_cmd(io, cmd, ub_cmd->addr);
- ublk_commit_completion(ub, ub_cmd);
break;
case UBLK_IO_NEED_GET_DATA:
- if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
- goto out;
- ublk_fill_io_cmd(io, cmd, ub_cmd->addr);
- req = blk_mq_tag_to_rq(ub->tag_set.tags[ub_cmd->q_id], tag);
- ublk_dispatch_req(ubq, req, issue_flags);
- return -EIOCBQUEUED;
+ io->addr = ub_cmd->addr;
+ if (!ublk_get_data(ubq, io))
+ return -EIOCBQUEUED;
+
+ return UBLK_IO_RES_OK;
default:
goto out;
}
@@ -2728,6 +2843,11 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header)
return -EINVAL;
}
+ if ((info.flags & UBLK_F_QUIESCE) && !(info.flags & UBLK_F_USER_RECOVERY)) {
+ pr_warn("UBLK_F_QUIESCE requires UBLK_F_USER_RECOVERY\n");
+ return -EINVAL;
+ }
+
/*
* unprivileged device can't be trusted, but RECOVERY and
* RECOVERY_REISSUE still may hang error handling, so can't
@@ -2744,8 +2864,11 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header)
* For USER_COPY, we depends on userspace to fill request
* buffer by pwrite() to ublk char device, which can't be
* used for unprivileged device
+ *
+ * Same with zero copy or auto buffer register.
*/
- if (info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY))
+ if (info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY |
+ UBLK_F_AUTO_BUF_REG))
return -EINVAL;
}
@@ -2803,7 +2926,8 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header)
UBLK_F_URING_CMD_COMP_IN_TASK;
/* GET_DATA isn't needed any more with USER_COPY or ZERO COPY */
- if (ub->dev_info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY))
+ if (ub->dev_info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY |
+ UBLK_F_AUTO_BUF_REG))
ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA;
/*
@@ -3106,6 +3230,127 @@ static int ublk_ctrl_get_features(const struct ublksrv_ctrl_cmd *header)
return 0;
}
+static void ublk_ctrl_set_size(struct ublk_device *ub, const struct ublksrv_ctrl_cmd *header)
+{
+ struct ublk_param_basic *p = &ub->params.basic;
+ u64 new_size = header->data[0];
+
+ mutex_lock(&ub->mutex);
+ p->dev_sectors = new_size;
+ set_capacity_and_notify(ub->ub_disk, p->dev_sectors);
+ mutex_unlock(&ub->mutex);
+}
+
+struct count_busy {
+ const struct ublk_queue *ubq;
+ unsigned int nr_busy;
+};
+
+static bool ublk_count_busy_req(struct request *rq, void *data)
+{
+ struct count_busy *idle = data;
+
+ if (!blk_mq_request_started(rq) && rq->mq_hctx->driver_data == idle->ubq)
+ idle->nr_busy += 1;
+ return true;
+}
+
+/* uring_cmd is guaranteed to be active if the associated request is idle */
+static bool ubq_has_idle_io(const struct ublk_queue *ubq)
+{
+ struct count_busy data = {
+ .ubq = ubq,
+ };
+
+ blk_mq_tagset_busy_iter(&ubq->dev->tag_set, ublk_count_busy_req, &data);
+ return data.nr_busy < ubq->q_depth;
+}
+
+/* Wait until each hw queue has at least one idle IO */
+static int ublk_wait_for_idle_io(struct ublk_device *ub,
+ unsigned int timeout_ms)
+{
+ unsigned int elapsed = 0;
+ int ret;
+
+ while (elapsed < timeout_ms && !signal_pending(current)) {
+ unsigned int queues_cancelable = 0;
+ int i;
+
+ for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
+ struct ublk_queue *ubq = ublk_get_queue(ub, i);
+
+ queues_cancelable += !!ubq_has_idle_io(ubq);
+ }
+
+ /*
+ * Each queue needs at least one active command for
+ * notifying ublk server
+ */
+ if (queues_cancelable == ub->dev_info.nr_hw_queues)
+ break;
+
+ msleep(UBLK_REQUEUE_DELAY_MS);
+ elapsed += UBLK_REQUEUE_DELAY_MS;
+ }
+
+ if (signal_pending(current))
+ ret = -EINTR;
+ else if (elapsed >= timeout_ms)
+ ret = -EBUSY;
+ else
+ ret = 0;
+
+ return ret;
+}
+
+static int ublk_ctrl_quiesce_dev(struct ublk_device *ub,
+ const struct ublksrv_ctrl_cmd *header)
+{
+ /* zero means wait forever */
+ u64 timeout_ms = header->data[0];
+ struct gendisk *disk;
+ int i, ret = -ENODEV;
+
+ if (!(ub->dev_info.flags & UBLK_F_QUIESCE))
+ return -EOPNOTSUPP;
+
+ mutex_lock(&ub->mutex);
+ disk = ublk_get_disk(ub);
+ if (!disk)
+ goto unlock;
+ if (ub->dev_info.state == UBLK_S_DEV_DEAD)
+ goto put_disk;
+
+ ret = 0;
+ /* already in expected state */
+ if (ub->dev_info.state != UBLK_S_DEV_LIVE)
+ goto put_disk;
+
+ /* Mark all queues as canceling */
+ blk_mq_quiesce_queue(disk->queue);
+ for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
+ struct ublk_queue *ubq = ublk_get_queue(ub, i);
+
+ ubq->canceling = true;
+ }
+ blk_mq_unquiesce_queue(disk->queue);
+
+ if (!timeout_ms)
+ timeout_ms = UINT_MAX;
+ ret = ublk_wait_for_idle_io(ub, timeout_ms);
+
+put_disk:
+ ublk_put_disk(disk);
+unlock:
+ mutex_unlock(&ub->mutex);
+
+ /* Cancel pending uring_cmd */
+ if (!ret)
+ ublk_cancel_dev(ub);
+ return ret;
+}
+
/*
* All control commands are sent via /dev/ublk-control, so we have to check
* the destination device's permission
@@ -3191,6 +3436,8 @@ static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub,
case UBLK_CMD_SET_PARAMS:
case UBLK_CMD_START_USER_RECOVERY:
case UBLK_CMD_END_USER_RECOVERY:
+ case UBLK_CMD_UPDATE_SIZE:
+ case UBLK_CMD_QUIESCE_DEV:
mask = MAY_READ | MAY_WRITE;
break;
default:
@@ -3282,6 +3529,13 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
case UBLK_CMD_END_USER_RECOVERY:
ret = ublk_ctrl_end_recovery(ub, header);
break;
+ case UBLK_CMD_UPDATE_SIZE:
+ ublk_ctrl_set_size(ub, header);
+ ret = 0;
+ break;
+ case UBLK_CMD_QUIESCE_DEV:
+ ret = ublk_ctrl_quiesce_dev(ub, header);
+ break;
default:
ret = -EOPNOTSUPP;
break;
@@ -3315,6 +3569,7 @@ static int __init ublk_init(void)
BUILD_BUG_ON((u64)UBLKSRV_IO_BUF_OFFSET +
UBLKSRV_IO_BUF_TOTAL_SIZE < UBLKSRV_IO_BUF_OFFSET);
+ BUILD_BUG_ON(sizeof(struct ublk_auto_buf_reg) != 8);
init_waitqueue_head(&ublk_idr_wq);
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 7cffea01d868..30bca8cb7106 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -571,7 +571,7 @@ static int virtblk_submit_zone_report(struct virtio_blk *vblk,
vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_ZONE_REPORT);
vbr->out_hdr.sector = cpu_to_virtio64(vblk->vdev, sector);
- err = blk_rq_map_kern(q, req, report_buf, report_len, GFP_KERNEL);
+ err = blk_rq_map_kern(req, report_buf, report_len, GFP_KERNEL);
if (err)
goto out;
@@ -817,7 +817,7 @@ static int virtblk_get_id(struct gendisk *disk, char *id_str)
vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_GET_ID);
vbr->out_hdr.sector = 0;
- err = blk_rq_map_kern(q, req, id_str, VIRTIO_BLK_ID_BYTES, GFP_KERNEL);
+ err = blk_rq_map_kern(req, id_str, VIRTIO_BLK_ID_BYTES, GFP_KERNEL);
if (err)
goto out;
diff --git a/drivers/block/zloop.c b/drivers/block/zloop.c
new file mode 100644
index 000000000000..553b1a713ab9
--- /dev/null
+++ b/drivers/block/zloop.c
@@ -0,0 +1,1385 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2025, Christoph Hellwig.
+ * Copyright (c) 2025, Western Digital Corporation or its affiliates.
+ *
+ * Zoned Loop Device driver - exports a zoned block device using one file per
+ * zone as backing storage.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/blk-mq.h>
+#include <linux/blkzoned.h>
+#include <linux/pagemap.h>
+#include <linux/miscdevice.h>
+#include <linux/falloc.h>
+#include <linux/mutex.h>
+#include <linux/parser.h>
+#include <linux/seq_file.h>
+
+/*
+ * Options for adding (and removing) a device.
+ */
+enum {
+ ZLOOP_OPT_ERR = 0,
+ ZLOOP_OPT_ID = (1 << 0),
+ ZLOOP_OPT_CAPACITY = (1 << 1),
+ ZLOOP_OPT_ZONE_SIZE = (1 << 2),
+ ZLOOP_OPT_ZONE_CAPACITY = (1 << 3),
+ ZLOOP_OPT_NR_CONV_ZONES = (1 << 4),
+ ZLOOP_OPT_BASE_DIR = (1 << 5),
+ ZLOOP_OPT_NR_QUEUES = (1 << 6),
+ ZLOOP_OPT_QUEUE_DEPTH = (1 << 7),
+ ZLOOP_OPT_BUFFERED_IO = (1 << 8),
+};
+
+static const match_table_t zloop_opt_tokens = {
+ { ZLOOP_OPT_ID, "id=%d" },
+ { ZLOOP_OPT_CAPACITY, "capacity_mb=%u" },
+ { ZLOOP_OPT_ZONE_SIZE, "zone_size_mb=%u" },
+ { ZLOOP_OPT_ZONE_CAPACITY, "zone_capacity_mb=%u" },
+ { ZLOOP_OPT_NR_CONV_ZONES, "conv_zones=%u" },
+ { ZLOOP_OPT_BASE_DIR, "base_dir=%s" },
+ { ZLOOP_OPT_NR_QUEUES, "nr_queues=%u" },
+ { ZLOOP_OPT_QUEUE_DEPTH, "queue_depth=%u" },
+ { ZLOOP_OPT_BUFFERED_IO, "buffered_io" },
+ { ZLOOP_OPT_ERR, NULL }
+};
+
+/* Default values for the "add" operation. */
+#define ZLOOP_DEF_ID -1
+#define ZLOOP_DEF_ZONE_SIZE ((256ULL * SZ_1M) >> SECTOR_SHIFT)
+#define ZLOOP_DEF_NR_ZONES 64
+#define ZLOOP_DEF_NR_CONV_ZONES 8
+#define ZLOOP_DEF_BASE_DIR "/var/local/zloop"
+#define ZLOOP_DEF_NR_QUEUES 1
+#define ZLOOP_DEF_QUEUE_DEPTH 128
+#define ZLOOP_DEF_BUFFERED_IO false
+
+/* Arbitrary limit on the zone size (16GB). */
+#define ZLOOP_MAX_ZONE_SIZE_MB 16384
+
+struct zloop_options {
+ unsigned int mask;
+ int id;
+ sector_t capacity;
+ sector_t zone_size;
+ sector_t zone_capacity;
+ unsigned int nr_conv_zones;
+ char *base_dir;
+ unsigned int nr_queues;
+ unsigned int queue_depth;
+ bool buffered_io;
+};
+
+/*
+ * Device states.
+ */
+enum {
+ Zlo_creating = 0,
+ Zlo_live,
+ Zlo_deleting,
+};
+
+enum zloop_zone_flags {
+ ZLOOP_ZONE_CONV = 0,
+ ZLOOP_ZONE_SEQ_ERROR,
+};
+
+struct zloop_zone {
+ struct file *file;
+
+ unsigned long flags;
+ struct mutex lock;
+ enum blk_zone_cond cond;
+ sector_t start;
+ sector_t wp;
+
+ gfp_t old_gfp_mask;
+};
+
+struct zloop_device {
+ unsigned int id;
+ unsigned int state;
+
+ struct blk_mq_tag_set tag_set;
+ struct gendisk *disk;
+
+ struct workqueue_struct *workqueue;
+ bool buffered_io;
+
+ const char *base_dir;
+ struct file *data_dir;
+
+ unsigned int zone_shift;
+ sector_t zone_size;
+ sector_t zone_capacity;
+ unsigned int nr_zones;
+ unsigned int nr_conv_zones;
+ unsigned int block_size;
+
+ struct zloop_zone zones[] __counted_by(nr_zones);
+};
+
+struct zloop_cmd {
+ struct work_struct work;
+ atomic_t ref;
+ sector_t sector;
+ sector_t nr_sectors;
+ long ret;
+ struct kiocb iocb;
+ struct bio_vec *bvec;
+};
+
+static DEFINE_IDR(zloop_index_idr);
+static DEFINE_MUTEX(zloop_ctl_mutex);
+
+static unsigned int rq_zone_no(struct request *rq)
+{
+ struct zloop_device *zlo = rq->q->queuedata;
+
+ return blk_rq_pos(rq) >> zlo->zone_shift;
+}
+
+static int zloop_update_seq_zone(struct zloop_device *zlo, unsigned int zone_no)
+{
+ struct zloop_zone *zone = &zlo->zones[zone_no];
+ struct kstat stat;
+ sector_t file_sectors;
+ int ret;
+
+ lockdep_assert_held(&zone->lock);
+
+ ret = vfs_getattr(&zone->file->f_path, &stat, STATX_SIZE, 0);
+ if (ret < 0) {
+ pr_err("Failed to get zone %u file stat (err=%d)\n",
+ zone_no, ret);
+ set_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
+ return ret;
+ }
+
+ file_sectors = stat.size >> SECTOR_SHIFT;
+ if (file_sectors > zlo->zone_capacity) {
+ pr_err("Zone %u file too large (%llu sectors > %llu)\n",
+ zone_no, file_sectors, zlo->zone_capacity);
+ return -EINVAL;
+ }
+
+ if (file_sectors & ((zlo->block_size >> SECTOR_SHIFT) - 1)) {
+ pr_err("Zone %u file size not aligned to block size %u\n",
+ zone_no, zlo->block_size);
+ return -EINVAL;
+ }
+
+ if (!file_sectors) {
+ zone->cond = BLK_ZONE_COND_EMPTY;
+ zone->wp = zone->start;
+ } else if (file_sectors == zlo->zone_capacity) {
+ zone->cond = BLK_ZONE_COND_FULL;
+ zone->wp = zone->start + zlo->zone_size;
+ } else {
+ zone->cond = BLK_ZONE_COND_CLOSED;
+ zone->wp = zone->start + file_sectors;
+ }
+
+ return 0;
+}
+
+static int zloop_open_zone(struct zloop_device *zlo, unsigned int zone_no)
+{
+ struct zloop_zone *zone = &zlo->zones[zone_no];
+ int ret = 0;
+
+ if (test_bit(ZLOOP_ZONE_CONV, &zone->flags))
+ return -EIO;
+
+ mutex_lock(&zone->lock);
+
+ if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags)) {
+ ret = zloop_update_seq_zone(zlo, zone_no);
+ if (ret)
+ goto unlock;
+ }
+
+ switch (zone->cond) {
+ case BLK_ZONE_COND_EXP_OPEN:
+ break;
+ case BLK_ZONE_COND_EMPTY:
+ case BLK_ZONE_COND_CLOSED:
+ case BLK_ZONE_COND_IMP_OPEN:
+ zone->cond = BLK_ZONE_COND_EXP_OPEN;
+ break;
+ case BLK_ZONE_COND_FULL:
+ default:
+ ret = -EIO;
+ break;
+ }
+
+unlock:
+ mutex_unlock(&zone->lock);
+
+ return ret;
+}
+
+static int zloop_close_zone(struct zloop_device *zlo, unsigned int zone_no)
+{
+ struct zloop_zone *zone = &zlo->zones[zone_no];
+ int ret = 0;
+
+ if (test_bit(ZLOOP_ZONE_CONV, &zone->flags))
+ return -EIO;
+
+ mutex_lock(&zone->lock);
+
+ if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags)) {
+ ret = zloop_update_seq_zone(zlo, zone_no);
+ if (ret)
+ goto unlock;
+ }
+
+ switch (zone->cond) {
+ case BLK_ZONE_COND_CLOSED:
+ break;
+ case BLK_ZONE_COND_IMP_OPEN:
+ case BLK_ZONE_COND_EXP_OPEN:
+ if (zone->wp == zone->start)
+ zone->cond = BLK_ZONE_COND_EMPTY;
+ else
+ zone->cond = BLK_ZONE_COND_CLOSED;
+ break;
+ case BLK_ZONE_COND_EMPTY:
+ case BLK_ZONE_COND_FULL:
+ default:
+ ret = -EIO;
+ break;
+ }
+
+unlock:
+ mutex_unlock(&zone->lock);
+
+ return ret;
+}
+
+static int zloop_reset_zone(struct zloop_device *zlo, unsigned int zone_no)
+{
+ struct zloop_zone *zone = &zlo->zones[zone_no];
+ int ret = 0;
+
+ if (test_bit(ZLOOP_ZONE_CONV, &zone->flags))
+ return -EIO;
+
+ mutex_lock(&zone->lock);
+
+ if (!test_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags) &&
+ zone->cond == BLK_ZONE_COND_EMPTY)
+ goto unlock;
+
+ if (vfs_truncate(&zone->file->f_path, 0)) {
+ set_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
+ ret = -EIO;
+ goto unlock;
+ }
+
+ zone->cond = BLK_ZONE_COND_EMPTY;
+ zone->wp = zone->start;
+ clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
+
+unlock:
+ mutex_unlock(&zone->lock);
+
+ return ret;
+}
+
+static int zloop_reset_all_zones(struct zloop_device *zlo)
+{
+ unsigned int i;
+ int ret;
+
+ for (i = zlo->nr_conv_zones; i < zlo->nr_zones; i++) {
+ ret = zloop_reset_zone(zlo, i);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int zloop_finish_zone(struct zloop_device *zlo, unsigned int zone_no)
+{
+ struct zloop_zone *zone = &zlo->zones[zone_no];
+ int ret = 0;
+
+ if (test_bit(ZLOOP_ZONE_CONV, &zone->flags))
+ return -EIO;
+
+ mutex_lock(&zone->lock);
+
+ if (!test_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags) &&
+ zone->cond == BLK_ZONE_COND_FULL)
+ goto unlock;
+
+ if (vfs_truncate(&zone->file->f_path, zlo->zone_size << SECTOR_SHIFT)) {
+ set_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
+ ret = -EIO;
+ goto unlock;
+ }
+
+ zone->cond = BLK_ZONE_COND_FULL;
+ zone->wp = zone->start + zlo->zone_size;
+ clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
+
+ unlock:
+ mutex_unlock(&zone->lock);
+
+ return ret;
+}
+
+static void zloop_put_cmd(struct zloop_cmd *cmd)
+{
+ struct request *rq = blk_mq_rq_from_pdu(cmd);
+
+ if (!atomic_dec_and_test(&cmd->ref))
+ return;
+ kfree(cmd->bvec);
+ cmd->bvec = NULL;
+ if (likely(!blk_should_fake_timeout(rq->q)))
+ blk_mq_complete_request(rq);
+}
+
+static void zloop_rw_complete(struct kiocb *iocb, long ret)
+{
+ struct zloop_cmd *cmd = container_of(iocb, struct zloop_cmd, iocb);
+
+ cmd->ret = ret;
+ zloop_put_cmd(cmd);
+}
+
+static void zloop_rw(struct zloop_cmd *cmd)
+{
+ struct request *rq = blk_mq_rq_from_pdu(cmd);
+ struct zloop_device *zlo = rq->q->queuedata;
+ unsigned int zone_no = rq_zone_no(rq);
+ sector_t sector = blk_rq_pos(rq);
+ sector_t nr_sectors = blk_rq_sectors(rq);
+ bool is_append = req_op(rq) == REQ_OP_ZONE_APPEND;
+ bool is_write = req_op(rq) == REQ_OP_WRITE || is_append;
+ int rw = is_write ? ITER_SOURCE : ITER_DEST;
+ struct req_iterator rq_iter;
+ struct zloop_zone *zone;
+ struct iov_iter iter;
+ struct bio_vec tmp;
+ sector_t zone_end;
+ int nr_bvec = 0;
+ int ret;
+
+ atomic_set(&cmd->ref, 2);
+ cmd->sector = sector;
+ cmd->nr_sectors = nr_sectors;
+ cmd->ret = 0;
+
+ /* We should never get an I/O beyond the device capacity. */
+ if (WARN_ON_ONCE(zone_no >= zlo->nr_zones)) {
+ ret = -EIO;
+ goto out;
+ }
+ zone = &zlo->zones[zone_no];
+ zone_end = zone->start + zlo->zone_capacity;
+
+ /*
+ * The block layer should never send requests that are not fully
+ * contained within the zone.
+ */
+ if (WARN_ON_ONCE(sector + nr_sectors > zone->start + zlo->zone_size)) {
+ ret = -EIO;
+ goto out;
+ }
+
+ if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags)) {
+ mutex_lock(&zone->lock);
+ ret = zloop_update_seq_zone(zlo, zone_no);
+ mutex_unlock(&zone->lock);
+ if (ret)
+ goto out;
+ }
+
+ if (!test_bit(ZLOOP_ZONE_CONV, &zone->flags) && is_write) {
+ mutex_lock(&zone->lock);
+
+ if (is_append) {
+ sector = zone->wp;
+ cmd->sector = sector;
+ }
+
+ /*
+ * Write operations must be aligned to the write pointer and
+ * fully contained within the zone capacity.
+ */
+ if (sector != zone->wp || zone->wp + nr_sectors > zone_end) {
+ pr_err("Zone %u: unaligned write: sect %llu, wp %llu\n",
+ zone_no, sector, zone->wp);
+ ret = -EIO;
+ goto unlock;
+ }
+
+ /* Implicitly open the target zone. */
+ if (zone->cond == BLK_ZONE_COND_CLOSED ||
+ zone->cond == BLK_ZONE_COND_EMPTY)
+ zone->cond = BLK_ZONE_COND_IMP_OPEN;
+
+ /*
+ * Advance the write pointer of sequential zones. If the write
+ * fails, the wp position will be corrected when the next I/O
+ * copmpletes.
+ */
+ zone->wp += nr_sectors;
+ if (zone->wp == zone_end)
+ zone->cond = BLK_ZONE_COND_FULL;
+ }
+
+ rq_for_each_bvec(tmp, rq, rq_iter)
+ nr_bvec++;
+
+ if (rq->bio != rq->biotail) {
+ struct bio_vec *bvec;
+
+ cmd->bvec = kmalloc_array(nr_bvec, sizeof(*cmd->bvec), GFP_NOIO);
+ if (!cmd->bvec) {
+ ret = -EIO;
+ goto unlock;
+ }
+
+ /*
+ * The bios of the request may be started from the middle of
+ * the 'bvec' because of bio splitting, so we can't directly
+ * copy bio->bi_iov_vec to new bvec. The rq_for_each_bvec
+ * API will take care of all details for us.
+ */
+ bvec = cmd->bvec;
+ rq_for_each_bvec(tmp, rq, rq_iter) {
+ *bvec = tmp;
+ bvec++;
+ }
+ iov_iter_bvec(&iter, rw, cmd->bvec, nr_bvec, blk_rq_bytes(rq));
+ } else {
+ /*
+ * Same here, this bio may be started from the middle of the
+ * 'bvec' because of bio splitting, so offset from the bvec
+ * must be passed to iov iterator
+ */
+ iov_iter_bvec(&iter, rw,
+ __bvec_iter_bvec(rq->bio->bi_io_vec, rq->bio->bi_iter),
+ nr_bvec, blk_rq_bytes(rq));
+ iter.iov_offset = rq->bio->bi_iter.bi_bvec_done;
+ }
+
+ cmd->iocb.ki_pos = (sector - zone->start) << SECTOR_SHIFT;
+ cmd->iocb.ki_filp = zone->file;
+ cmd->iocb.ki_complete = zloop_rw_complete;
+ if (!zlo->buffered_io)
+ cmd->iocb.ki_flags = IOCB_DIRECT;
+ cmd->iocb.ki_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0);
+
+ if (rw == ITER_SOURCE)
+ ret = zone->file->f_op->write_iter(&cmd->iocb, &iter);
+ else
+ ret = zone->file->f_op->read_iter(&cmd->iocb, &iter);
+unlock:
+ if (!test_bit(ZLOOP_ZONE_CONV, &zone->flags) && is_write)
+ mutex_unlock(&zone->lock);
+out:
+ if (ret != -EIOCBQUEUED)
+ zloop_rw_complete(&cmd->iocb, ret);
+ zloop_put_cmd(cmd);
+}
+
+static void zloop_handle_cmd(struct zloop_cmd *cmd)
+{
+ struct request *rq = blk_mq_rq_from_pdu(cmd);
+ struct zloop_device *zlo = rq->q->queuedata;
+
+ switch (req_op(rq)) {
+ case REQ_OP_READ:
+ case REQ_OP_WRITE:
+ case REQ_OP_ZONE_APPEND:
+ /*
+ * zloop_rw() always executes asynchronously or completes
+ * directly.
+ */
+ zloop_rw(cmd);
+ return;
+ case REQ_OP_FLUSH:
+ /*
+ * Sync the entire FS containing the zone files instead of
+ * walking all files
+ */
+ cmd->ret = sync_filesystem(file_inode(zlo->data_dir)->i_sb);
+ break;
+ case REQ_OP_ZONE_RESET:
+ cmd->ret = zloop_reset_zone(zlo, rq_zone_no(rq));
+ break;
+ case REQ_OP_ZONE_RESET_ALL:
+ cmd->ret = zloop_reset_all_zones(zlo);
+ break;
+ case REQ_OP_ZONE_FINISH:
+ cmd->ret = zloop_finish_zone(zlo, rq_zone_no(rq));
+ break;
+ case REQ_OP_ZONE_OPEN:
+ cmd->ret = zloop_open_zone(zlo, rq_zone_no(rq));
+ break;
+ case REQ_OP_ZONE_CLOSE:
+ cmd->ret = zloop_close_zone(zlo, rq_zone_no(rq));
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ pr_err("Unsupported operation %d\n", req_op(rq));
+ cmd->ret = -EOPNOTSUPP;
+ break;
+ }
+
+ blk_mq_complete_request(rq);
+}
+
+static void zloop_cmd_workfn(struct work_struct *work)
+{
+ struct zloop_cmd *cmd = container_of(work, struct zloop_cmd, work);
+ int orig_flags = current->flags;
+
+ current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO;
+ zloop_handle_cmd(cmd);
+ current->flags = orig_flags;
+}
+
+static void zloop_complete_rq(struct request *rq)
+{
+ struct zloop_cmd *cmd = blk_mq_rq_to_pdu(rq);
+ struct zloop_device *zlo = rq->q->queuedata;
+ unsigned int zone_no = cmd->sector >> zlo->zone_shift;
+ struct zloop_zone *zone = &zlo->zones[zone_no];
+ blk_status_t sts = BLK_STS_OK;
+
+ switch (req_op(rq)) {
+ case REQ_OP_READ:
+ if (cmd->ret < 0)
+ pr_err("Zone %u: failed read sector %llu, %llu sectors\n",
+ zone_no, cmd->sector, cmd->nr_sectors);
+
+ if (cmd->ret >= 0 && cmd->ret != blk_rq_bytes(rq)) {
+ /* short read */
+ struct bio *bio;
+
+ __rq_for_each_bio(bio, rq)
+ zero_fill_bio(bio);
+ }
+ break;
+ case REQ_OP_WRITE:
+ case REQ_OP_ZONE_APPEND:
+ if (cmd->ret < 0)
+ pr_err("Zone %u: failed %swrite sector %llu, %llu sectors\n",
+ zone_no,
+ req_op(rq) == REQ_OP_WRITE ? "" : "append ",
+ cmd->sector, cmd->nr_sectors);
+
+ if (cmd->ret >= 0 && cmd->ret != blk_rq_bytes(rq)) {
+ pr_err("Zone %u: partial write %ld/%u B\n",
+ zone_no, cmd->ret, blk_rq_bytes(rq));
+ cmd->ret = -EIO;
+ }
+
+ if (cmd->ret < 0 && !test_bit(ZLOOP_ZONE_CONV, &zone->flags)) {
+ /*
+ * A write to a sequential zone file failed: mark the
+ * zone as having an error. This will be corrected and
+ * cleared when the next IO is submitted.
+ */
+ set_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
+ break;
+ }
+ if (req_op(rq) == REQ_OP_ZONE_APPEND)
+ rq->__sector = cmd->sector;
+
+ break;
+ default:
+ break;
+ }
+
+ if (cmd->ret < 0)
+ sts = errno_to_blk_status(cmd->ret);
+ blk_mq_end_request(rq, sts);
+}
+
+static blk_status_t zloop_queue_rq(struct blk_mq_hw_ctx *hctx,
+ const struct blk_mq_queue_data *bd)
+{
+ struct request *rq = bd->rq;
+ struct zloop_cmd *cmd = blk_mq_rq_to_pdu(rq);
+ struct zloop_device *zlo = rq->q->queuedata;
+
+ if (zlo->state == Zlo_deleting)
+ return BLK_STS_IOERR;
+
+ blk_mq_start_request(rq);
+
+ INIT_WORK(&cmd->work, zloop_cmd_workfn);
+ queue_work(zlo->workqueue, &cmd->work);
+
+ return BLK_STS_OK;
+}
+
+static const struct blk_mq_ops zloop_mq_ops = {
+ .queue_rq = zloop_queue_rq,
+ .complete = zloop_complete_rq,
+};
+
+static int zloop_open(struct gendisk *disk, blk_mode_t mode)
+{
+ struct zloop_device *zlo = disk->private_data;
+ int ret;
+
+ ret = mutex_lock_killable(&zloop_ctl_mutex);
+ if (ret)
+ return ret;
+
+ if (zlo->state != Zlo_live)
+ ret = -ENXIO;
+ mutex_unlock(&zloop_ctl_mutex);
+ return ret;
+}
+
+static int zloop_report_zones(struct gendisk *disk, sector_t sector,
+ unsigned int nr_zones, report_zones_cb cb, void *data)
+{
+ struct zloop_device *zlo = disk->private_data;
+ struct blk_zone blkz = {};
+ unsigned int first, i;
+ int ret;
+
+ first = disk_zone_no(disk, sector);
+ if (first >= zlo->nr_zones)
+ return 0;
+ nr_zones = min(nr_zones, zlo->nr_zones - first);
+
+ for (i = 0; i < nr_zones; i++) {
+ unsigned int zone_no = first + i;
+ struct zloop_zone *zone = &zlo->zones[zone_no];
+
+ mutex_lock(&zone->lock);
+
+ if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags)) {
+ ret = zloop_update_seq_zone(zlo, zone_no);
+ if (ret) {
+ mutex_unlock(&zone->lock);
+ return ret;
+ }
+ }
+
+ blkz.start = zone->start;
+ blkz.len = zlo->zone_size;
+ blkz.wp = zone->wp;
+ blkz.cond = zone->cond;
+ if (test_bit(ZLOOP_ZONE_CONV, &zone->flags)) {
+ blkz.type = BLK_ZONE_TYPE_CONVENTIONAL;
+ blkz.capacity = zlo->zone_size;
+ } else {
+ blkz.type = BLK_ZONE_TYPE_SEQWRITE_REQ;
+ blkz.capacity = zlo->zone_capacity;
+ }
+
+ mutex_unlock(&zone->lock);
+
+ ret = cb(&blkz, i, data);
+ if (ret)
+ return ret;
+ }
+
+ return nr_zones;
+}
+
+static void zloop_free_disk(struct gendisk *disk)
+{
+ struct zloop_device *zlo = disk->private_data;
+ unsigned int i;
+
+ for (i = 0; i < zlo->nr_zones; i++) {
+ struct zloop_zone *zone = &zlo->zones[i];
+
+ mapping_set_gfp_mask(zone->file->f_mapping,
+ zone->old_gfp_mask);
+ fput(zone->file);
+ }
+
+ fput(zlo->data_dir);
+ destroy_workqueue(zlo->workqueue);
+ kfree(zlo->base_dir);
+ kvfree(zlo);
+}
+
+static const struct block_device_operations zloop_fops = {
+ .owner = THIS_MODULE,
+ .open = zloop_open,
+ .report_zones = zloop_report_zones,
+ .free_disk = zloop_free_disk,
+};
+
+__printf(3, 4)
+static struct file *zloop_filp_open_fmt(int oflags, umode_t mode,
+ const char *fmt, ...)
+{
+ struct file *file;
+ va_list ap;
+ char *p;
+
+ va_start(ap, fmt);
+ p = kvasprintf(GFP_KERNEL, fmt, ap);
+ va_end(ap);
+
+ if (!p)
+ return ERR_PTR(-ENOMEM);
+ file = filp_open(p, oflags, mode);
+ kfree(p);
+ return file;
+}
+
+static int zloop_get_block_size(struct zloop_device *zlo,
+ struct zloop_zone *zone)
+{
+ struct block_device *sb_bdev = zone->file->f_mapping->host->i_sb->s_bdev;
+ struct kstat st;
+
+ /*
+ * If the FS block size is lower than or equal to 4K, use that as the
+ * device block size. Otherwise, fallback to the FS direct IO alignment
+ * constraint if that is provided, and to the FS underlying device
+ * physical block size if the direct IO alignment is unknown.
+ */
+ if (file_inode(zone->file)->i_sb->s_blocksize <= SZ_4K)
+ zlo->block_size = file_inode(zone->file)->i_sb->s_blocksize;
+ else if (!vfs_getattr(&zone->file->f_path, &st, STATX_DIOALIGN, 0) &&
+ (st.result_mask & STATX_DIOALIGN))
+ zlo->block_size = st.dio_offset_align;
+ else if (sb_bdev)
+ zlo->block_size = bdev_physical_block_size(sb_bdev);
+ else
+ zlo->block_size = SECTOR_SIZE;
+
+ if (zlo->zone_capacity & ((zlo->block_size >> SECTOR_SHIFT) - 1)) {
+ pr_err("Zone capacity is not aligned to block size %u\n",
+ zlo->block_size);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int zloop_init_zone(struct zloop_device *zlo, struct zloop_options *opts,
+ unsigned int zone_no, bool restore)
+{
+ struct zloop_zone *zone = &zlo->zones[zone_no];
+ int oflags = O_RDWR;
+ struct kstat stat;
+ sector_t file_sectors;
+ int ret;
+
+ mutex_init(&zone->lock);
+ zone->start = (sector_t)zone_no << zlo->zone_shift;
+
+ if (!restore)
+ oflags |= O_CREAT;
+
+ if (!opts->buffered_io)
+ oflags |= O_DIRECT;
+
+ if (zone_no < zlo->nr_conv_zones) {
+ /* Conventional zone file. */
+ set_bit(ZLOOP_ZONE_CONV, &zone->flags);
+ zone->cond = BLK_ZONE_COND_NOT_WP;
+ zone->wp = U64_MAX;
+
+ zone->file = zloop_filp_open_fmt(oflags, 0600, "%s/%u/cnv-%06u",
+ zlo->base_dir, zlo->id, zone_no);
+ if (IS_ERR(zone->file)) {
+ pr_err("Failed to open zone %u file %s/%u/cnv-%06u (err=%ld)",
+ zone_no, zlo->base_dir, zlo->id, zone_no,
+ PTR_ERR(zone->file));
+ return PTR_ERR(zone->file);
+ }
+
+ if (!zlo->block_size) {
+ ret = zloop_get_block_size(zlo, zone);
+ if (ret)
+ return ret;
+ }
+
+ ret = vfs_getattr(&zone->file->f_path, &stat, STATX_SIZE, 0);
+ if (ret < 0) {
+ pr_err("Failed to get zone %u file stat\n", zone_no);
+ return ret;
+ }
+ file_sectors = stat.size >> SECTOR_SHIFT;
+
+ if (restore && file_sectors != zlo->zone_size) {
+ pr_err("Invalid conventional zone %u file size (%llu sectors != %llu)\n",
+ zone_no, file_sectors, zlo->zone_capacity);
+ return ret;
+ }
+
+ ret = vfs_truncate(&zone->file->f_path,
+ zlo->zone_size << SECTOR_SHIFT);
+ if (ret < 0) {
+ pr_err("Failed to truncate zone %u file (err=%d)\n",
+ zone_no, ret);
+ return ret;
+ }
+
+ return 0;
+ }
+
+ /* Sequential zone file. */
+ zone->file = zloop_filp_open_fmt(oflags, 0600, "%s/%u/seq-%06u",
+ zlo->base_dir, zlo->id, zone_no);
+ if (IS_ERR(zone->file)) {
+ pr_err("Failed to open zone %u file %s/%u/seq-%06u (err=%ld)",
+ zone_no, zlo->base_dir, zlo->id, zone_no,
+ PTR_ERR(zone->file));
+ return PTR_ERR(zone->file);
+ }
+
+ if (!zlo->block_size) {
+ ret = zloop_get_block_size(zlo, zone);
+ if (ret)
+ return ret;
+ }
+
+ zloop_get_block_size(zlo, zone);
+
+ mutex_lock(&zone->lock);
+ ret = zloop_update_seq_zone(zlo, zone_no);
+ mutex_unlock(&zone->lock);
+
+ return ret;
+}
+
+static bool zloop_dev_exists(struct zloop_device *zlo)
+{
+ struct file *cnv, *seq;
+ bool exists;
+
+ cnv = zloop_filp_open_fmt(O_RDONLY, 0600, "%s/%u/cnv-%06u",
+ zlo->base_dir, zlo->id, 0);
+ seq = zloop_filp_open_fmt(O_RDONLY, 0600, "%s/%u/seq-%06u",
+ zlo->base_dir, zlo->id, 0);
+ exists = !IS_ERR(cnv) || !IS_ERR(seq);
+
+ if (!IS_ERR(cnv))
+ fput(cnv);
+ if (!IS_ERR(seq))
+ fput(seq);
+
+ return exists;
+}
+
+static int zloop_ctl_add(struct zloop_options *opts)
+{
+ struct queue_limits lim = {
+ .max_hw_sectors = SZ_1M >> SECTOR_SHIFT,
+ .max_hw_zone_append_sectors = SZ_1M >> SECTOR_SHIFT,
+ .chunk_sectors = opts->zone_size,
+ .features = BLK_FEAT_ZONED,
+ };
+ unsigned int nr_zones, i, j;
+ struct zloop_device *zlo;
+ int ret = -EINVAL;
+ bool restore;
+
+ __module_get(THIS_MODULE);
+
+ nr_zones = opts->capacity >> ilog2(opts->zone_size);
+ if (opts->nr_conv_zones >= nr_zones) {
+ pr_err("Invalid number of conventional zones %u\n",
+ opts->nr_conv_zones);
+ goto out;
+ }
+
+ zlo = kvzalloc(struct_size(zlo, zones, nr_zones), GFP_KERNEL);
+ if (!zlo) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ zlo->state = Zlo_creating;
+
+ ret = mutex_lock_killable(&zloop_ctl_mutex);
+ if (ret)
+ goto out_free_dev;
+
+ /* Allocate id, if @opts->id >= 0, we're requesting that specific id */
+ if (opts->id >= 0) {
+ ret = idr_alloc(&zloop_index_idr, zlo,
+ opts->id, opts->id + 1, GFP_KERNEL);
+ if (ret == -ENOSPC)
+ ret = -EEXIST;
+ } else {
+ ret = idr_alloc(&zloop_index_idr, zlo, 0, 0, GFP_KERNEL);
+ }
+ mutex_unlock(&zloop_ctl_mutex);
+ if (ret < 0)
+ goto out_free_dev;
+
+ zlo->id = ret;
+ zlo->zone_shift = ilog2(opts->zone_size);
+ zlo->zone_size = opts->zone_size;
+ if (opts->zone_capacity)
+ zlo->zone_capacity = opts->zone_capacity;
+ else
+ zlo->zone_capacity = zlo->zone_size;
+ zlo->nr_zones = nr_zones;
+ zlo->nr_conv_zones = opts->nr_conv_zones;
+ zlo->buffered_io = opts->buffered_io;
+
+ zlo->workqueue = alloc_workqueue("zloop%d", WQ_UNBOUND | WQ_FREEZABLE,
+ opts->nr_queues * opts->queue_depth, zlo->id);
+ if (!zlo->workqueue) {
+ ret = -ENOMEM;
+ goto out_free_idr;
+ }
+
+ if (opts->base_dir)
+ zlo->base_dir = kstrdup(opts->base_dir, GFP_KERNEL);
+ else
+ zlo->base_dir = kstrdup(ZLOOP_DEF_BASE_DIR, GFP_KERNEL);
+ if (!zlo->base_dir) {
+ ret = -ENOMEM;
+ goto out_destroy_workqueue;
+ }
+
+ zlo->data_dir = zloop_filp_open_fmt(O_RDONLY | O_DIRECTORY, 0, "%s/%u",
+ zlo->base_dir, zlo->id);
+ if (IS_ERR(zlo->data_dir)) {
+ ret = PTR_ERR(zlo->data_dir);
+ pr_warn("Failed to open directory %s/%u (err=%d)\n",
+ zlo->base_dir, zlo->id, ret);
+ goto out_free_base_dir;
+ }
+
+ /*
+ * If we already have zone files, we are restoring a device created by a
+ * previous add operation. In this case, zloop_init_zone() will check
+ * that the zone files are consistent with the zone configuration given.
+ */
+ restore = zloop_dev_exists(zlo);
+ for (i = 0; i < nr_zones; i++) {
+ ret = zloop_init_zone(zlo, opts, i, restore);
+ if (ret)
+ goto out_close_files;
+ }
+
+ lim.physical_block_size = zlo->block_size;
+ lim.logical_block_size = zlo->block_size;
+
+ zlo->tag_set.ops = &zloop_mq_ops;
+ zlo->tag_set.nr_hw_queues = opts->nr_queues;
+ zlo->tag_set.queue_depth = opts->queue_depth;
+ zlo->tag_set.numa_node = NUMA_NO_NODE;
+ zlo->tag_set.cmd_size = sizeof(struct zloop_cmd);
+ zlo->tag_set.driver_data = zlo;
+
+ ret = blk_mq_alloc_tag_set(&zlo->tag_set);
+ if (ret) {
+ pr_err("blk_mq_alloc_tag_set failed (err=%d)\n", ret);
+ goto out_close_files;
+ }
+
+ zlo->disk = blk_mq_alloc_disk(&zlo->tag_set, &lim, zlo);
+ if (IS_ERR(zlo->disk)) {
+ pr_err("blk_mq_alloc_disk failed (err=%d)\n", ret);
+ ret = PTR_ERR(zlo->disk);
+ goto out_cleanup_tags;
+ }
+ zlo->disk->flags = GENHD_FL_NO_PART;
+ zlo->disk->fops = &zloop_fops;
+ zlo->disk->private_data = zlo;
+ sprintf(zlo->disk->disk_name, "zloop%d", zlo->id);
+ set_capacity(zlo->disk, (u64)lim.chunk_sectors * zlo->nr_zones);
+
+ ret = blk_revalidate_disk_zones(zlo->disk);
+ if (ret)
+ goto out_cleanup_disk;
+
+ ret = add_disk(zlo->disk);
+ if (ret) {
+ pr_err("add_disk failed (err=%d)\n", ret);
+ goto out_cleanup_disk;
+ }
+
+ mutex_lock(&zloop_ctl_mutex);
+ zlo->state = Zlo_live;
+ mutex_unlock(&zloop_ctl_mutex);
+
+ pr_info("Added device %d: %u zones of %llu MB, %u B block size\n",
+ zlo->id, zlo->nr_zones,
+ ((sector_t)zlo->zone_size << SECTOR_SHIFT) >> 20,
+ zlo->block_size);
+
+ return 0;
+
+out_cleanup_disk:
+ put_disk(zlo->disk);
+out_cleanup_tags:
+ blk_mq_free_tag_set(&zlo->tag_set);
+out_close_files:
+ for (j = 0; j < i; j++) {
+ struct zloop_zone *zone = &zlo->zones[j];
+
+ if (!IS_ERR_OR_NULL(zone->file))
+ fput(zone->file);
+ }
+ fput(zlo->data_dir);
+out_free_base_dir:
+ kfree(zlo->base_dir);
+out_destroy_workqueue:
+ destroy_workqueue(zlo->workqueue);
+out_free_idr:
+ mutex_lock(&zloop_ctl_mutex);
+ idr_remove(&zloop_index_idr, zlo->id);
+ mutex_unlock(&zloop_ctl_mutex);
+out_free_dev:
+ kvfree(zlo);
+out:
+ module_put(THIS_MODULE);
+ if (ret == -ENOENT)
+ ret = -EINVAL;
+ return ret;
+}
+
+static int zloop_ctl_remove(struct zloop_options *opts)
+{
+ struct zloop_device *zlo;
+ int ret;
+
+ if (!(opts->mask & ZLOOP_OPT_ID)) {
+ pr_err("No ID specified\n");
+ return -EINVAL;
+ }
+
+ ret = mutex_lock_killable(&zloop_ctl_mutex);
+ if (ret)
+ return ret;
+
+ zlo = idr_find(&zloop_index_idr, opts->id);
+ if (!zlo || zlo->state == Zlo_creating) {
+ ret = -ENODEV;
+ } else if (zlo->state == Zlo_deleting) {
+ ret = -EINVAL;
+ } else {
+ idr_remove(&zloop_index_idr, zlo->id);
+ zlo->state = Zlo_deleting;
+ }
+
+ mutex_unlock(&zloop_ctl_mutex);
+ if (ret)
+ return ret;
+
+ del_gendisk(zlo->disk);
+ put_disk(zlo->disk);
+ blk_mq_free_tag_set(&zlo->tag_set);
+
+ pr_info("Removed device %d\n", opts->id);
+
+ module_put(THIS_MODULE);
+
+ return 0;
+}
+
+static int zloop_parse_options(struct zloop_options *opts, const char *buf)
+{
+ substring_t args[MAX_OPT_ARGS];
+ char *options, *o, *p;
+ unsigned int token;
+ int ret = 0;
+
+ /* Set defaults. */
+ opts->mask = 0;
+ opts->id = ZLOOP_DEF_ID;
+ opts->capacity = ZLOOP_DEF_ZONE_SIZE * ZLOOP_DEF_NR_ZONES;
+ opts->zone_size = ZLOOP_DEF_ZONE_SIZE;
+ opts->nr_conv_zones = ZLOOP_DEF_NR_CONV_ZONES;
+ opts->nr_queues = ZLOOP_DEF_NR_QUEUES;
+ opts->queue_depth = ZLOOP_DEF_QUEUE_DEPTH;
+ opts->buffered_io = ZLOOP_DEF_BUFFERED_IO;
+
+ if (!buf)
+ return 0;
+
+ /* Skip leading spaces before the options. */
+ while (isspace(*buf))
+ buf++;
+
+ options = o = kstrdup(buf, GFP_KERNEL);
+ if (!options)
+ return -ENOMEM;
+
+ /* Parse the options, doing only some light invalid value checks. */
+ while ((p = strsep(&o, ",\n")) != NULL) {
+ if (!*p)
+ continue;
+
+ token = match_token(p, zloop_opt_tokens, args);
+ opts->mask |= token;
+ switch (token) {
+ case ZLOOP_OPT_ID:
+ if (match_int(args, &opts->id)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ break;
+ case ZLOOP_OPT_CAPACITY:
+ if (match_uint(args, &token)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ if (!token) {
+ pr_err("Invalid capacity\n");
+ ret = -EINVAL;
+ goto out;
+ }
+ opts->capacity =
+ ((sector_t)token * SZ_1M) >> SECTOR_SHIFT;
+ break;
+ case ZLOOP_OPT_ZONE_SIZE:
+ if (match_uint(args, &token)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ if (!token || token > ZLOOP_MAX_ZONE_SIZE_MB ||
+ !is_power_of_2(token)) {
+ pr_err("Invalid zone size %u\n", token);
+ ret = -EINVAL;
+ goto out;
+ }
+ opts->zone_size =
+ ((sector_t)token * SZ_1M) >> SECTOR_SHIFT;
+ break;
+ case ZLOOP_OPT_ZONE_CAPACITY:
+ if (match_uint(args, &token)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ if (!token) {
+ pr_err("Invalid zone capacity\n");
+ ret = -EINVAL;
+ goto out;
+ }
+ opts->zone_capacity =
+ ((sector_t)token * SZ_1M) >> SECTOR_SHIFT;
+ break;
+ case ZLOOP_OPT_NR_CONV_ZONES:
+ if (match_uint(args, &token)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ opts->nr_conv_zones = token;
+ break;
+ case ZLOOP_OPT_BASE_DIR:
+ p = match_strdup(args);
+ if (!p) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ kfree(opts->base_dir);
+ opts->base_dir = p;
+ break;
+ case ZLOOP_OPT_NR_QUEUES:
+ if (match_uint(args, &token)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ if (!token) {
+ pr_err("Invalid number of queues\n");
+ ret = -EINVAL;
+ goto out;
+ }
+ opts->nr_queues = min(token, num_online_cpus());
+ break;
+ case ZLOOP_OPT_QUEUE_DEPTH:
+ if (match_uint(args, &token)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ if (!token) {
+ pr_err("Invalid queue depth\n");
+ ret = -EINVAL;
+ goto out;
+ }
+ opts->queue_depth = token;
+ break;
+ case ZLOOP_OPT_BUFFERED_IO:
+ opts->buffered_io = true;
+ break;
+ case ZLOOP_OPT_ERR:
+ default:
+ pr_warn("unknown parameter or missing value '%s'\n", p);
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+
+ ret = -EINVAL;
+ if (opts->capacity <= opts->zone_size) {
+ pr_err("Invalid capacity\n");
+ goto out;
+ }
+
+ if (opts->zone_capacity > opts->zone_size) {
+ pr_err("Invalid zone capacity\n");
+ goto out;
+ }
+
+ ret = 0;
+out:
+ kfree(options);
+ return ret;
+}
+
+enum {
+ ZLOOP_CTL_ADD,
+ ZLOOP_CTL_REMOVE,
+};
+
+static struct zloop_ctl_op {
+ int code;
+ const char *name;
+} zloop_ctl_ops[] = {
+ { ZLOOP_CTL_ADD, "add" },
+ { ZLOOP_CTL_REMOVE, "remove" },
+ { -1, NULL },
+};
+
+static ssize_t zloop_ctl_write(struct file *file, const char __user *ubuf,
+ size_t count, loff_t *pos)
+{
+ struct zloop_options opts = { };
+ struct zloop_ctl_op *op;
+ const char *buf, *opts_buf;
+ int i, ret;
+
+ if (count > PAGE_SIZE)
+ return -ENOMEM;
+
+ buf = memdup_user_nul(ubuf, count);
+ if (IS_ERR(buf))
+ return PTR_ERR(buf);
+
+ for (i = 0; i < ARRAY_SIZE(zloop_ctl_ops); i++) {
+ op = &zloop_ctl_ops[i];
+ if (!op->name) {
+ pr_err("Invalid operation\n");
+ ret = -EINVAL;
+ goto out;
+ }
+ if (!strncmp(buf, op->name, strlen(op->name)))
+ break;
+ }
+
+ if (count <= strlen(op->name))
+ opts_buf = NULL;
+ else
+ opts_buf = buf + strlen(op->name);
+
+ ret = zloop_parse_options(&opts, opts_buf);
+ if (ret) {
+ pr_err("Failed to parse options\n");
+ goto out;
+ }
+
+ switch (op->code) {
+ case ZLOOP_CTL_ADD:
+ ret = zloop_ctl_add(&opts);
+ break;
+ case ZLOOP_CTL_REMOVE:
+ ret = zloop_ctl_remove(&opts);
+ break;
+ default:
+ pr_err("Invalid operation\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+out:
+ kfree(opts.base_dir);
+ kfree(buf);
+ return ret ? ret : count;
+}
+
+static int zloop_ctl_show(struct seq_file *seq_file, void *private)
+{
+ const struct match_token *tok;
+ int i;
+
+ /* Add operation */
+ seq_printf(seq_file, "%s ", zloop_ctl_ops[0].name);
+ for (i = 0; i < ARRAY_SIZE(zloop_opt_tokens); i++) {
+ tok = &zloop_opt_tokens[i];
+ if (!tok->pattern)
+ break;
+ if (i)
+ seq_putc(seq_file, ',');
+ seq_puts(seq_file, tok->pattern);
+ }
+ seq_putc(seq_file, '\n');
+
+ /* Remove operation */
+ seq_puts(seq_file, zloop_ctl_ops[1].name);
+ seq_puts(seq_file, " id=%d\n");
+
+ return 0;
+}
+
+static int zloop_ctl_open(struct inode *inode, struct file *file)
+{
+ file->private_data = NULL;
+ return single_open(file, zloop_ctl_show, NULL);
+}
+
+static int zloop_ctl_release(struct inode *inode, struct file *file)
+{
+ return single_release(inode, file);
+}
+
+static const struct file_operations zloop_ctl_fops = {
+ .owner = THIS_MODULE,
+ .open = zloop_ctl_open,
+ .release = zloop_ctl_release,
+ .write = zloop_ctl_write,
+ .read = seq_read,
+};
+
+static struct miscdevice zloop_misc = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "zloop-control",
+ .fops = &zloop_ctl_fops,
+};
+
+static int __init zloop_init(void)
+{
+ int ret;
+
+ ret = misc_register(&zloop_misc);
+ if (ret) {
+ pr_err("Failed to register misc device: %d\n", ret);
+ return ret;
+ }
+ pr_info("Module loaded\n");
+
+ return 0;
+}
+
+static void __exit zloop_exit(void)
+{
+ misc_deregister(&zloop_misc);
+ idr_destroy(&zloop_index_idr);
+}
+
+module_init(zloop_init);
+module_exit(zloop_exit);
+
+MODULE_DESCRIPTION("Zoned loopback device");
+MODULE_LICENSE("GPL");
diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c
index a42dedb78e0a..256b451bbe06 100644
--- a/drivers/bluetooth/btusb.c
+++ b/drivers/bluetooth/btusb.c
@@ -3014,9 +3014,8 @@ static void btusb_coredump_qca(struct hci_dev *hdev)
static int handle_dump_pkt_qca(struct hci_dev *hdev, struct sk_buff *skb)
{
int ret = 0;
+ unsigned int skip = 0;
u8 pkt_type;
- u8 *sk_ptr;
- unsigned int sk_len;
u16 seqno;
u32 dump_size;
@@ -3025,18 +3024,13 @@ static int handle_dump_pkt_qca(struct hci_dev *hdev, struct sk_buff *skb)
struct usb_device *udev = btdata->udev;
pkt_type = hci_skb_pkt_type(skb);
- sk_ptr = skb->data;
- sk_len = skb->len;
+ skip = sizeof(struct hci_event_hdr);
+ if (pkt_type == HCI_ACLDATA_PKT)
+ skip += sizeof(struct hci_acl_hdr);
- if (pkt_type == HCI_ACLDATA_PKT) {
- sk_ptr += HCI_ACL_HDR_SIZE;
- sk_len -= HCI_ACL_HDR_SIZE;
- }
-
- sk_ptr += HCI_EVENT_HDR_SIZE;
- sk_len -= HCI_EVENT_HDR_SIZE;
+ skb_pull(skb, skip);
+ dump_hdr = (struct qca_dump_hdr *)skb->data;
- dump_hdr = (struct qca_dump_hdr *)sk_ptr;
seqno = le16_to_cpu(dump_hdr->seqno);
if (seqno == 0) {
set_bit(BTUSB_HW_SSR_ACTIVE, &btdata->flags);
@@ -3056,16 +3050,15 @@ static int handle_dump_pkt_qca(struct hci_dev *hdev, struct sk_buff *skb)
btdata->qca_dump.ram_dump_size = dump_size;
btdata->qca_dump.ram_dump_seqno = 0;
- sk_ptr += offsetof(struct qca_dump_hdr, data0);
- sk_len -= offsetof(struct qca_dump_hdr, data0);
+
+ skb_pull(skb, offsetof(struct qca_dump_hdr, data0));
usb_disable_autosuspend(udev);
bt_dev_info(hdev, "%s memdump size(%u)\n",
(pkt_type == HCI_ACLDATA_PKT) ? "ACL" : "event",
dump_size);
} else {
- sk_ptr += offsetof(struct qca_dump_hdr, data);
- sk_len -= offsetof(struct qca_dump_hdr, data);
+ skb_pull(skb, offsetof(struct qca_dump_hdr, data));
}
if (!btdata->qca_dump.ram_dump_size) {
@@ -3085,7 +3078,6 @@ static int handle_dump_pkt_qca(struct hci_dev *hdev, struct sk_buff *skb)
return ret;
}
- skb_pull(skb, skb->len - sk_len);
hci_devcd_append(hdev, skb);
btdata->qca_dump.ram_dump_seqno++;
if (seqno == QCA_LAST_SEQUENCE_NUM) {
@@ -3113,68 +3105,58 @@ out:
/* Return: true if the ACL packet is a dump packet, false otherwise. */
static bool acl_pkt_is_dump_qca(struct hci_dev *hdev, struct sk_buff *skb)
{
- u8 *sk_ptr;
- unsigned int sk_len;
-
struct hci_event_hdr *event_hdr;
struct hci_acl_hdr *acl_hdr;
struct qca_dump_hdr *dump_hdr;
+ struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC);
+ bool is_dump = false;
- sk_ptr = skb->data;
- sk_len = skb->len;
-
- acl_hdr = hci_acl_hdr(skb);
- if (le16_to_cpu(acl_hdr->handle) != QCA_MEMDUMP_ACL_HANDLE)
+ if (!clone)
return false;
- sk_ptr += HCI_ACL_HDR_SIZE;
- sk_len -= HCI_ACL_HDR_SIZE;
- event_hdr = (struct hci_event_hdr *)sk_ptr;
-
- if ((event_hdr->evt != HCI_VENDOR_PKT) ||
- (event_hdr->plen != (sk_len - HCI_EVENT_HDR_SIZE)))
- return false;
+ acl_hdr = skb_pull_data(clone, sizeof(*acl_hdr));
+ if (!acl_hdr || (le16_to_cpu(acl_hdr->handle) != QCA_MEMDUMP_ACL_HANDLE))
+ goto out;
- sk_ptr += HCI_EVENT_HDR_SIZE;
- sk_len -= HCI_EVENT_HDR_SIZE;
+ event_hdr = skb_pull_data(clone, sizeof(*event_hdr));
+ if (!event_hdr || (event_hdr->evt != HCI_VENDOR_PKT))
+ goto out;
- dump_hdr = (struct qca_dump_hdr *)sk_ptr;
- if ((sk_len < offsetof(struct qca_dump_hdr, data)) ||
- (dump_hdr->vse_class != QCA_MEMDUMP_VSE_CLASS) ||
- (dump_hdr->msg_type != QCA_MEMDUMP_MSG_TYPE))
- return false;
+ dump_hdr = skb_pull_data(clone, sizeof(*dump_hdr));
+ if (!dump_hdr || (dump_hdr->vse_class != QCA_MEMDUMP_VSE_CLASS) ||
+ (dump_hdr->msg_type != QCA_MEMDUMP_MSG_TYPE))
+ goto out;
- return true;
+ is_dump = true;
+out:
+ consume_skb(clone);
+ return is_dump;
}
/* Return: true if the event packet is a dump packet, false otherwise. */
static bool evt_pkt_is_dump_qca(struct hci_dev *hdev, struct sk_buff *skb)
{
- u8 *sk_ptr;
- unsigned int sk_len;
-
struct hci_event_hdr *event_hdr;
struct qca_dump_hdr *dump_hdr;
+ struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC);
+ bool is_dump = false;
- sk_ptr = skb->data;
- sk_len = skb->len;
-
- event_hdr = hci_event_hdr(skb);
-
- if ((event_hdr->evt != HCI_VENDOR_PKT)
- || (event_hdr->plen != (sk_len - HCI_EVENT_HDR_SIZE)))
+ if (!clone)
return false;
- sk_ptr += HCI_EVENT_HDR_SIZE;
- sk_len -= HCI_EVENT_HDR_SIZE;
+ event_hdr = skb_pull_data(clone, sizeof(*event_hdr));
+ if (!event_hdr || (event_hdr->evt != HCI_VENDOR_PKT))
+ goto out;
- dump_hdr = (struct qca_dump_hdr *)sk_ptr;
- if ((sk_len < offsetof(struct qca_dump_hdr, data)) ||
- (dump_hdr->vse_class != QCA_MEMDUMP_VSE_CLASS) ||
- (dump_hdr->msg_type != QCA_MEMDUMP_MSG_TYPE))
- return false;
+ dump_hdr = skb_pull_data(clone, sizeof(*dump_hdr));
+ if (!dump_hdr || (dump_hdr->vse_class != QCA_MEMDUMP_VSE_CLASS) ||
+ (dump_hdr->msg_type != QCA_MEMDUMP_MSG_TYPE))
+ goto out;
- return true;
+ is_dump = true;
+out:
+ consume_skb(clone);
+ return is_dump;
}
static int btusb_recv_acl_qca(struct hci_dev *hdev, struct sk_buff *skb)
diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c
index b163e043c687..21a10552da61 100644
--- a/drivers/cdrom/cdrom.c
+++ b/drivers/cdrom/cdrom.c
@@ -3677,8 +3677,7 @@ static void cdrom_sysctl_register(void)
static void cdrom_sysctl_unregister(void)
{
- if (cdrom_sysctl_header)
- unregister_sysctl_table(cdrom_sysctl_header);
+ unregister_sysctl_table(cdrom_sysctl_header);
}
#else /* CONFIG_SYSCTL */
diff --git a/drivers/clk/clk-s2mps11.c b/drivers/clk/clk-s2mps11.c
index 014db6386624..8ddf3a9a53df 100644
--- a/drivers/clk/clk-s2mps11.c
+++ b/drivers/clk/clk-s2mps11.c
@@ -137,6 +137,8 @@ static int s2mps11_clk_probe(struct platform_device *pdev)
if (!clk_data)
return -ENOMEM;
+ clk_data->num = S2MPS11_CLKS_NUM;
+
switch (hwid) {
case S2MPS11X:
s2mps11_reg = S2MPS11_REG_RTC_CTRL;
@@ -186,7 +188,6 @@ static int s2mps11_clk_probe(struct platform_device *pdev)
clk_data->hws[i] = &s2mps11_clks[i].hw;
}
- clk_data->num = S2MPS11_CLKS_NUM;
of_clk_add_hw_provider(s2mps11_clks->clk_np, of_clk_hw_onecell_get,
clk_data);
diff --git a/drivers/clk/rockchip/clk-rk3576.c b/drivers/clk/rockchip/clk-rk3576.c
index 595e010341f7..be703f250197 100644
--- a/drivers/clk/rockchip/clk-rk3576.c
+++ b/drivers/clk/rockchip/clk-rk3576.c
@@ -541,6 +541,8 @@ static struct rockchip_clk_branch rk3576_clk_branches[] __initdata = {
RK3576_CLKGATE_CON(5), 14, GFLAGS),
GATE(CLK_OTPC_AUTO_RD_G, "clk_otpc_auto_rd_g", "xin24m", 0,
RK3576_CLKGATE_CON(5), 15, GFLAGS),
+ GATE(CLK_OTP_PHY_G, "clk_otp_phy_g", "xin24m", 0,
+ RK3576_CLKGATE_CON(6), 0, GFLAGS),
COMPOSITE(CLK_MIPI_CAMERAOUT_M0, "clk_mipi_cameraout_m0", mux_24m_spll_gpll_cpll_p, 0,
RK3576_CLKSEL_CON(38), 8, 2, MFLAGS, 0, 8, DFLAGS,
RK3576_CLKGATE_CON(6), 3, GFLAGS),
diff --git a/drivers/clk/sunxi-ng/ccu-sun20i-d1.c b/drivers/clk/sunxi-ng/ccu-sun20i-d1.c
index bb66c906ebbb..e83d4fd40240 100644
--- a/drivers/clk/sunxi-ng/ccu-sun20i-d1.c
+++ b/drivers/clk/sunxi-ng/ccu-sun20i-d1.c
@@ -412,19 +412,23 @@ static const struct clk_parent_data mmc0_mmc1_parents[] = {
{ .hw = &pll_periph0_2x_clk.common.hw },
{ .hw = &pll_audio1_div2_clk.common.hw },
};
-static SUNXI_CCU_MP_DATA_WITH_MUX_GATE(mmc0_clk, "mmc0", mmc0_mmc1_parents, 0x830,
- 0, 4, /* M */
- 8, 2, /* P */
- 24, 3, /* mux */
- BIT(31), /* gate */
- 0);
-
-static SUNXI_CCU_MP_DATA_WITH_MUX_GATE(mmc1_clk, "mmc1", mmc0_mmc1_parents, 0x834,
- 0, 4, /* M */
- 8, 2, /* P */
- 24, 3, /* mux */
- BIT(31), /* gate */
- 0);
+static SUNXI_CCU_MP_DATA_WITH_MUX_GATE_POSTDIV(mmc0_clk, "mmc0",
+ mmc0_mmc1_parents, 0x830,
+ 0, 4, /* M */
+ 8, 2, /* P */
+ 24, 3, /* mux */
+ BIT(31), /* gate */
+ 2, /* post-div */
+ 0);
+
+static SUNXI_CCU_MP_DATA_WITH_MUX_GATE_POSTDIV(mmc1_clk, "mmc1",
+ mmc0_mmc1_parents, 0x834,
+ 0, 4, /* M */
+ 8, 2, /* P */
+ 24, 3, /* mux */
+ BIT(31), /* gate */
+ 2, /* post-div */
+ 0);
static const struct clk_parent_data mmc2_parents[] = {
{ .fw_name = "hosc" },
@@ -433,12 +437,14 @@ static const struct clk_parent_data mmc2_parents[] = {
{ .hw = &pll_periph0_800M_clk.common.hw },
{ .hw = &pll_audio1_div2_clk.common.hw },
};
-static SUNXI_CCU_MP_DATA_WITH_MUX_GATE(mmc2_clk, "mmc2", mmc2_parents, 0x838,
- 0, 4, /* M */
- 8, 2, /* P */
- 24, 3, /* mux */
- BIT(31), /* gate */
- 0);
+static SUNXI_CCU_MP_DATA_WITH_MUX_GATE_POSTDIV(mmc2_clk, "mmc2", mmc2_parents,
+ 0x838,
+ 0, 4, /* M */
+ 8, 2, /* P */
+ 24, 3, /* mux */
+ BIT(31), /* gate */
+ 2, /* post-div */
+ 0);
static SUNXI_CCU_GATE_HWS(bus_mmc0_clk, "bus-mmc0", psi_ahb_hws,
0x84c, BIT(0), 0);
diff --git a/drivers/clk/sunxi-ng/ccu_mp.h b/drivers/clk/sunxi-ng/ccu_mp.h
index b35aeec70484..bb09c649bfa3 100644
--- a/drivers/clk/sunxi-ng/ccu_mp.h
+++ b/drivers/clk/sunxi-ng/ccu_mp.h
@@ -52,6 +52,28 @@ struct ccu_mp {
} \
}
+#define SUNXI_CCU_MP_DATA_WITH_MUX_GATE_POSTDIV(_struct, _name, _parents, \
+ _reg, \
+ _mshift, _mwidth, \
+ _pshift, _pwidth, \
+ _muxshift, _muxwidth, \
+ _gate, _postdiv, _flags)\
+ struct ccu_mp _struct = { \
+ .enable = _gate, \
+ .m = _SUNXI_CCU_DIV(_mshift, _mwidth), \
+ .p = _SUNXI_CCU_DIV(_pshift, _pwidth), \
+ .mux = _SUNXI_CCU_MUX(_muxshift, _muxwidth), \
+ .fixed_post_div = _postdiv, \
+ .common = { \
+ .reg = _reg, \
+ .features = CCU_FEATURE_FIXED_POSTDIV, \
+ .hw.init = CLK_HW_INIT_PARENTS_DATA(_name, \
+ _parents, \
+ &ccu_mp_ops, \
+ _flags), \
+ } \
+ }
+
#define SUNXI_CCU_MP_WITH_MUX_GATE(_struct, _name, _parents, _reg, \
_mshift, _mwidth, \
_pshift, _pwidth, \
@@ -109,8 +131,7 @@ struct ccu_mp {
_mshift, _mwidth, \
_pshift, _pwidth, \
_muxshift, _muxwidth, \
- _gate, _features, \
- _flags) \
+ _gate, _flags, _features) \
struct ccu_mp _struct = { \
.enable = _gate, \
.m = _SUNXI_CCU_DIV(_mshift, _mwidth), \
diff --git a/drivers/dma-buf/dma-resv.c b/drivers/dma-buf/dma-resv.c
index 5f8d010516f0..b1ef4546346d 100644
--- a/drivers/dma-buf/dma-resv.c
+++ b/drivers/dma-buf/dma-resv.c
@@ -320,8 +320,9 @@ void dma_resv_add_fence(struct dma_resv *obj, struct dma_fence *fence,
count++;
dma_resv_list_set(fobj, i, fence, usage);
- /* pointer update must be visible before we extend the num_fences */
- smp_store_mb(fobj->num_fences, count);
+ /* fence update must be visible before we extend the num_fences */
+ smp_wmb();
+ fobj->num_fences = count;
}
EXPORT_SYMBOL(dma_resv_add_fence);
diff --git a/drivers/dma/amd/ptdma/ptdma-dmaengine.c b/drivers/dma/amd/ptdma/ptdma-dmaengine.c
index 715ac3ae067b..81339664036f 100644
--- a/drivers/dma/amd/ptdma/ptdma-dmaengine.c
+++ b/drivers/dma/amd/ptdma/ptdma-dmaengine.c
@@ -342,6 +342,9 @@ static void pt_cmd_callback_work(void *data, int err)
struct pt_dma_chan *chan;
unsigned long flags;
+ if (!desc)
+ return;
+
dma_chan = desc->vd.tx.chan;
chan = to_pt_chan(dma_chan);
@@ -355,16 +358,14 @@ static void pt_cmd_callback_work(void *data, int err)
desc->status = DMA_ERROR;
spin_lock_irqsave(&chan->vc.lock, flags);
- if (desc) {
- if (desc->status != DMA_COMPLETE) {
- if (desc->status != DMA_ERROR)
- desc->status = DMA_COMPLETE;
+ if (desc->status != DMA_COMPLETE) {
+ if (desc->status != DMA_ERROR)
+ desc->status = DMA_COMPLETE;
- dma_cookie_complete(tx_desc);
- dma_descriptor_unmap(tx_desc);
- } else {
- tx_desc = NULL;
- }
+ dma_cookie_complete(tx_desc);
+ dma_descriptor_unmap(tx_desc);
+ } else {
+ tx_desc = NULL;
}
spin_unlock_irqrestore(&chan->vc.lock, flags);
diff --git a/drivers/dma/dmatest.c b/drivers/dma/dmatest.c
index d891dfca358e..91b2fbc0b864 100644
--- a/drivers/dma/dmatest.c
+++ b/drivers/dma/dmatest.c
@@ -841,9 +841,9 @@ static int dmatest_func(void *data)
} else {
dma_async_issue_pending(chan);
- wait_event_timeout(thread->done_wait,
- done->done,
- msecs_to_jiffies(params->timeout));
+ wait_event_freezable_timeout(thread->done_wait,
+ done->done,
+ msecs_to_jiffies(params->timeout));
status = dma_async_is_tx_complete(chan, cookie, NULL,
NULL);
diff --git a/drivers/dma/fsl-edma-main.c b/drivers/dma/fsl-edma-main.c
index 756d67325db5..66bfa28d984e 100644
--- a/drivers/dma/fsl-edma-main.c
+++ b/drivers/dma/fsl-edma-main.c
@@ -57,7 +57,7 @@ static irqreturn_t fsl_edma3_tx_handler(int irq, void *dev_id)
intr = edma_readl_chreg(fsl_chan, ch_int);
if (!intr)
- return IRQ_HANDLED;
+ return IRQ_NONE;
edma_writel_chreg(fsl_chan, 1, ch_int);
diff --git a/drivers/dma/idxd/cdev.c b/drivers/dma/idxd/cdev.c
index ff94ee892339..6d12033649f8 100644
--- a/drivers/dma/idxd/cdev.c
+++ b/drivers/dma/idxd/cdev.c
@@ -222,7 +222,7 @@ static int idxd_cdev_open(struct inode *inode, struct file *filp)
struct idxd_wq *wq;
struct device *dev, *fdev;
int rc = 0;
- struct iommu_sva *sva;
+ struct iommu_sva *sva = NULL;
unsigned int pasid;
struct idxd_cdev *idxd_cdev;
@@ -317,7 +317,7 @@ failed_set_pasid:
if (device_user_pasid_enabled(idxd))
idxd_xa_pasid_remove(ctx);
failed_get_pasid:
- if (device_user_pasid_enabled(idxd))
+ if (device_user_pasid_enabled(idxd) && !IS_ERR_OR_NULL(sva))
iommu_sva_unbind_device(sva);
failed:
mutex_unlock(&wq->wq_lock);
@@ -407,6 +407,9 @@ static int idxd_cdev_mmap(struct file *filp, struct vm_area_struct *vma)
if (!idxd->user_submission_safe && !capable(CAP_SYS_RAWIO))
return -EPERM;
+ if (current->mm != ctx->mm)
+ return -EPERM;
+
rc = check_vma(wq, vma, __func__);
if (rc < 0)
return rc;
@@ -473,6 +476,9 @@ static ssize_t idxd_cdev_write(struct file *filp, const char __user *buf, size_t
ssize_t written = 0;
int i;
+ if (current->mm != ctx->mm)
+ return -EPERM;
+
for (i = 0; i < len/sizeof(struct dsa_hw_desc); i++) {
int rc = idxd_submit_user_descriptor(ctx, udesc + i);
@@ -493,6 +499,9 @@ static __poll_t idxd_cdev_poll(struct file *filp,
struct idxd_device *idxd = wq->idxd;
__poll_t out = 0;
+ if (current->mm != ctx->mm)
+ return POLLNVAL;
+
poll_wait(filp, &wq->err_queue, wait);
spin_lock(&idxd->dev_lock);
if (idxd->sw_err.valid)
diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c
index fca1d2924999..760b7d81fcd8 100644
--- a/drivers/dma/idxd/init.c
+++ b/drivers/dma/idxd/init.c
@@ -155,6 +155,25 @@ static void idxd_cleanup_interrupts(struct idxd_device *idxd)
pci_free_irq_vectors(pdev);
}
+static void idxd_clean_wqs(struct idxd_device *idxd)
+{
+ struct idxd_wq *wq;
+ struct device *conf_dev;
+ int i;
+
+ for (i = 0; i < idxd->max_wqs; i++) {
+ wq = idxd->wqs[i];
+ if (idxd->hw.wq_cap.op_config)
+ bitmap_free(wq->opcap_bmap);
+ kfree(wq->wqcfg);
+ conf_dev = wq_confdev(wq);
+ put_device(conf_dev);
+ kfree(wq);
+ }
+ bitmap_free(idxd->wq_enable_map);
+ kfree(idxd->wqs);
+}
+
static int idxd_setup_wqs(struct idxd_device *idxd)
{
struct device *dev = &idxd->pdev->dev;
@@ -169,8 +188,8 @@ static int idxd_setup_wqs(struct idxd_device *idxd)
idxd->wq_enable_map = bitmap_zalloc_node(idxd->max_wqs, GFP_KERNEL, dev_to_node(dev));
if (!idxd->wq_enable_map) {
- kfree(idxd->wqs);
- return -ENOMEM;
+ rc = -ENOMEM;
+ goto err_bitmap;
}
for (i = 0; i < idxd->max_wqs; i++) {
@@ -189,10 +208,8 @@ static int idxd_setup_wqs(struct idxd_device *idxd)
conf_dev->bus = &dsa_bus_type;
conf_dev->type = &idxd_wq_device_type;
rc = dev_set_name(conf_dev, "wq%d.%d", idxd->id, wq->id);
- if (rc < 0) {
- put_device(conf_dev);
+ if (rc < 0)
goto err;
- }
mutex_init(&wq->wq_lock);
init_waitqueue_head(&wq->err_queue);
@@ -203,7 +220,6 @@ static int idxd_setup_wqs(struct idxd_device *idxd)
wq->enqcmds_retries = IDXD_ENQCMDS_RETRIES;
wq->wqcfg = kzalloc_node(idxd->wqcfg_size, GFP_KERNEL, dev_to_node(dev));
if (!wq->wqcfg) {
- put_device(conf_dev);
rc = -ENOMEM;
goto err;
}
@@ -211,9 +227,8 @@ static int idxd_setup_wqs(struct idxd_device *idxd)
if (idxd->hw.wq_cap.op_config) {
wq->opcap_bmap = bitmap_zalloc(IDXD_MAX_OPCAP_BITS, GFP_KERNEL);
if (!wq->opcap_bmap) {
- put_device(conf_dev);
rc = -ENOMEM;
- goto err;
+ goto err_opcap_bmap;
}
bitmap_copy(wq->opcap_bmap, idxd->opcap_bmap, IDXD_MAX_OPCAP_BITS);
}
@@ -224,15 +239,46 @@ static int idxd_setup_wqs(struct idxd_device *idxd)
return 0;
- err:
+err_opcap_bmap:
+ kfree(wq->wqcfg);
+
+err:
+ put_device(conf_dev);
+ kfree(wq);
+
while (--i >= 0) {
wq = idxd->wqs[i];
+ if (idxd->hw.wq_cap.op_config)
+ bitmap_free(wq->opcap_bmap);
+ kfree(wq->wqcfg);
conf_dev = wq_confdev(wq);
put_device(conf_dev);
+ kfree(wq);
+
}
+ bitmap_free(idxd->wq_enable_map);
+
+err_bitmap:
+ kfree(idxd->wqs);
+
return rc;
}
+static void idxd_clean_engines(struct idxd_device *idxd)
+{
+ struct idxd_engine *engine;
+ struct device *conf_dev;
+ int i;
+
+ for (i = 0; i < idxd->max_engines; i++) {
+ engine = idxd->engines[i];
+ conf_dev = engine_confdev(engine);
+ put_device(conf_dev);
+ kfree(engine);
+ }
+ kfree(idxd->engines);
+}
+
static int idxd_setup_engines(struct idxd_device *idxd)
{
struct idxd_engine *engine;
@@ -263,6 +309,7 @@ static int idxd_setup_engines(struct idxd_device *idxd)
rc = dev_set_name(conf_dev, "engine%d.%d", idxd->id, engine->id);
if (rc < 0) {
put_device(conf_dev);
+ kfree(engine);
goto err;
}
@@ -276,10 +323,26 @@ static int idxd_setup_engines(struct idxd_device *idxd)
engine = idxd->engines[i];
conf_dev = engine_confdev(engine);
put_device(conf_dev);
+ kfree(engine);
}
+ kfree(idxd->engines);
+
return rc;
}
+static void idxd_clean_groups(struct idxd_device *idxd)
+{
+ struct idxd_group *group;
+ int i;
+
+ for (i = 0; i < idxd->max_groups; i++) {
+ group = idxd->groups[i];
+ put_device(group_confdev(group));
+ kfree(group);
+ }
+ kfree(idxd->groups);
+}
+
static int idxd_setup_groups(struct idxd_device *idxd)
{
struct device *dev = &idxd->pdev->dev;
@@ -310,6 +373,7 @@ static int idxd_setup_groups(struct idxd_device *idxd)
rc = dev_set_name(conf_dev, "group%d.%d", idxd->id, group->id);
if (rc < 0) {
put_device(conf_dev);
+ kfree(group);
goto err;
}
@@ -334,20 +398,18 @@ static int idxd_setup_groups(struct idxd_device *idxd)
while (--i >= 0) {
group = idxd->groups[i];
put_device(group_confdev(group));
+ kfree(group);
}
+ kfree(idxd->groups);
+
return rc;
}
static void idxd_cleanup_internals(struct idxd_device *idxd)
{
- int i;
-
- for (i = 0; i < idxd->max_groups; i++)
- put_device(group_confdev(idxd->groups[i]));
- for (i = 0; i < idxd->max_engines; i++)
- put_device(engine_confdev(idxd->engines[i]));
- for (i = 0; i < idxd->max_wqs; i++)
- put_device(wq_confdev(idxd->wqs[i]));
+ idxd_clean_groups(idxd);
+ idxd_clean_engines(idxd);
+ idxd_clean_wqs(idxd);
destroy_workqueue(idxd->wq);
}
@@ -390,7 +452,7 @@ static int idxd_init_evl(struct idxd_device *idxd)
static int idxd_setup_internals(struct idxd_device *idxd)
{
struct device *dev = &idxd->pdev->dev;
- int rc, i;
+ int rc;
init_waitqueue_head(&idxd->cmd_waitq);
@@ -421,14 +483,11 @@ static int idxd_setup_internals(struct idxd_device *idxd)
err_evl:
destroy_workqueue(idxd->wq);
err_wkq_create:
- for (i = 0; i < idxd->max_groups; i++)
- put_device(group_confdev(idxd->groups[i]));
+ idxd_clean_groups(idxd);
err_group:
- for (i = 0; i < idxd->max_engines; i++)
- put_device(engine_confdev(idxd->engines[i]));
+ idxd_clean_engines(idxd);
err_engine:
- for (i = 0; i < idxd->max_wqs; i++)
- put_device(wq_confdev(idxd->wqs[i]));
+ idxd_clean_wqs(idxd);
err_wqs:
return rc;
}
@@ -528,6 +587,17 @@ static void idxd_read_caps(struct idxd_device *idxd)
idxd->hw.iaa_cap.bits = ioread64(idxd->reg_base + IDXD_IAACAP_OFFSET);
}
+static void idxd_free(struct idxd_device *idxd)
+{
+ if (!idxd)
+ return;
+
+ put_device(idxd_confdev(idxd));
+ bitmap_free(idxd->opcap_bmap);
+ ida_free(&idxd_ida, idxd->id);
+ kfree(idxd);
+}
+
static struct idxd_device *idxd_alloc(struct pci_dev *pdev, struct idxd_driver_data *data)
{
struct device *dev = &pdev->dev;
@@ -545,28 +615,34 @@ static struct idxd_device *idxd_alloc(struct pci_dev *pdev, struct idxd_driver_d
idxd_dev_set_type(&idxd->idxd_dev, idxd->data->type);
idxd->id = ida_alloc(&idxd_ida, GFP_KERNEL);
if (idxd->id < 0)
- return NULL;
+ goto err_ida;
idxd->opcap_bmap = bitmap_zalloc_node(IDXD_MAX_OPCAP_BITS, GFP_KERNEL, dev_to_node(dev));
- if (!idxd->opcap_bmap) {
- ida_free(&idxd_ida, idxd->id);
- return NULL;
- }
+ if (!idxd->opcap_bmap)
+ goto err_opcap;
device_initialize(conf_dev);
conf_dev->parent = dev;
conf_dev->bus = &dsa_bus_type;
conf_dev->type = idxd->data->dev_type;
rc = dev_set_name(conf_dev, "%s%d", idxd->data->name_prefix, idxd->id);
- if (rc < 0) {
- put_device(conf_dev);
- return NULL;
- }
+ if (rc < 0)
+ goto err_name;
spin_lock_init(&idxd->dev_lock);
spin_lock_init(&idxd->cmd_lock);
return idxd;
+
+err_name:
+ put_device(conf_dev);
+ bitmap_free(idxd->opcap_bmap);
+err_opcap:
+ ida_free(&idxd_ida, idxd->id);
+err_ida:
+ kfree(idxd);
+
+ return NULL;
}
static int idxd_enable_system_pasid(struct idxd_device *idxd)
@@ -1190,7 +1266,7 @@ int idxd_pci_probe_alloc(struct idxd_device *idxd, struct pci_dev *pdev,
err:
pci_iounmap(pdev, idxd->reg_base);
err_iomap:
- put_device(idxd_confdev(idxd));
+ idxd_free(idxd);
err_idxd_alloc:
pci_disable_device(pdev);
return rc;
@@ -1232,7 +1308,6 @@ static void idxd_shutdown(struct pci_dev *pdev)
static void idxd_remove(struct pci_dev *pdev)
{
struct idxd_device *idxd = pci_get_drvdata(pdev);
- struct idxd_irq_entry *irq_entry;
idxd_unregister_devices(idxd);
/*
@@ -1245,20 +1320,12 @@ static void idxd_remove(struct pci_dev *pdev)
get_device(idxd_confdev(idxd));
device_unregister(idxd_confdev(idxd));
idxd_shutdown(pdev);
- if (device_pasid_enabled(idxd))
- idxd_disable_system_pasid(idxd);
idxd_device_remove_debugfs(idxd);
-
- irq_entry = idxd_get_ie(idxd, 0);
- free_irq(irq_entry->vector, irq_entry);
- pci_free_irq_vectors(pdev);
+ idxd_cleanup(idxd);
pci_iounmap(pdev, idxd->reg_base);
- if (device_user_pasid_enabled(idxd))
- idxd_disable_sva(pdev);
- pci_disable_device(pdev);
- destroy_workqueue(idxd->wq);
- perfmon_pmu_remove(idxd);
put_device(idxd_confdev(idxd));
+ idxd_free(idxd);
+ pci_disable_device(pdev);
}
static struct pci_driver idxd_pci_driver = {
diff --git a/drivers/dma/mediatek/mtk-cqdma.c b/drivers/dma/mediatek/mtk-cqdma.c
index d5ddb4e30e71..47c8adfdc155 100644
--- a/drivers/dma/mediatek/mtk-cqdma.c
+++ b/drivers/dma/mediatek/mtk-cqdma.c
@@ -420,15 +420,11 @@ static struct virt_dma_desc *mtk_cqdma_find_active_desc(struct dma_chan *c,
{
struct mtk_cqdma_vchan *cvc = to_cqdma_vchan(c);
struct virt_dma_desc *vd;
- unsigned long flags;
- spin_lock_irqsave(&cvc->pc->lock, flags);
list_for_each_entry(vd, &cvc->pc->queue, node)
if (vd->tx.cookie == cookie) {
- spin_unlock_irqrestore(&cvc->pc->lock, flags);
return vd;
}
- spin_unlock_irqrestore(&cvc->pc->lock, flags);
list_for_each_entry(vd, &cvc->vc.desc_issued, node)
if (vd->tx.cookie == cookie)
@@ -452,9 +448,11 @@ static enum dma_status mtk_cqdma_tx_status(struct dma_chan *c,
if (ret == DMA_COMPLETE || !txstate)
return ret;
+ spin_lock_irqsave(&cvc->pc->lock, flags);
spin_lock_irqsave(&cvc->vc.lock, flags);
vd = mtk_cqdma_find_active_desc(c, cookie);
spin_unlock_irqrestore(&cvc->vc.lock, flags);
+ spin_unlock_irqrestore(&cvc->pc->lock, flags);
if (vd) {
cvd = to_cqdma_vdesc(vd);
diff --git a/drivers/dma/ti/k3-udma.c b/drivers/dma/ti/k3-udma.c
index b223a7aacb0c..b6255c0601bb 100644
--- a/drivers/dma/ti/k3-udma.c
+++ b/drivers/dma/ti/k3-udma.c
@@ -1091,8 +1091,11 @@ static void udma_check_tx_completion(struct work_struct *work)
u32 residue_diff;
ktime_t time_diff;
unsigned long delay;
+ unsigned long flags;
while (1) {
+ spin_lock_irqsave(&uc->vc.lock, flags);
+
if (uc->desc) {
/* Get previous residue and time stamp */
residue_diff = uc->tx_drain.residue;
@@ -1127,6 +1130,8 @@ static void udma_check_tx_completion(struct work_struct *work)
break;
}
+ spin_unlock_irqrestore(&uc->vc.lock, flags);
+
usleep_range(ktime_to_us(delay),
ktime_to_us(delay) + 10);
continue;
@@ -1143,6 +1148,8 @@ static void udma_check_tx_completion(struct work_struct *work)
break;
}
+
+ spin_unlock_irqrestore(&uc->vc.lock, flags);
}
static irqreturn_t udma_ring_irq_handler(int irq, void *data)
@@ -4246,7 +4253,6 @@ static struct dma_chan *udma_of_xlate(struct of_phandle_args *dma_spec,
struct of_dma *ofdma)
{
struct udma_dev *ud = ofdma->of_dma_data;
- dma_cap_mask_t mask = ud->ddev.cap_mask;
struct udma_filter_param filter_param;
struct dma_chan *chan;
@@ -4278,7 +4284,7 @@ static struct dma_chan *udma_of_xlate(struct of_phandle_args *dma_spec,
}
}
- chan = __dma_request_channel(&mask, udma_dma_filter_fn, &filter_param,
+ chan = __dma_request_channel(&ud->ddev.cap_mask, udma_dma_filter_fn, &filter_param,
ofdma->of_node);
if (!chan) {
dev_err(ud->dev, "get channel fail in %s.\n", __func__);
diff --git a/drivers/firmware/samsung/exynos-acpm.c b/drivers/firmware/samsung/exynos-acpm.c
index a85b2dbdd9f0..15e991b99f5a 100644
--- a/drivers/firmware/samsung/exynos-acpm.c
+++ b/drivers/firmware/samsung/exynos-acpm.c
@@ -185,6 +185,29 @@ struct acpm_match_data {
#define handle_to_acpm_info(h) container_of(h, struct acpm_info, handle)
/**
+ * acpm_get_saved_rx() - get the response if it was already saved.
+ * @achan: ACPM channel info.
+ * @xfer: reference to the transfer to get response for.
+ * @tx_seqnum: xfer TX sequence number.
+ */
+static void acpm_get_saved_rx(struct acpm_chan *achan,
+ const struct acpm_xfer *xfer, u32 tx_seqnum)
+{
+ const struct acpm_rx_data *rx_data = &achan->rx_data[tx_seqnum - 1];
+ u32 rx_seqnum;
+
+ if (!rx_data->response)
+ return;
+
+ rx_seqnum = FIELD_GET(ACPM_PROTOCOL_SEQNUM, rx_data->cmd[0]);
+
+ if (rx_seqnum == tx_seqnum) {
+ memcpy(xfer->rxd, rx_data->cmd, xfer->rxlen);
+ clear_bit(rx_seqnum - 1, achan->bitmap_seqnum);
+ }
+}
+
+/**
* acpm_get_rx() - get response from RX queue.
* @achan: ACPM channel info.
* @xfer: reference to the transfer to get response for.
@@ -204,15 +227,16 @@ static int acpm_get_rx(struct acpm_chan *achan, const struct acpm_xfer *xfer)
rx_front = readl(achan->rx.front);
i = readl(achan->rx.rear);
- /* Bail out if RX is empty. */
- if (i == rx_front)
+ tx_seqnum = FIELD_GET(ACPM_PROTOCOL_SEQNUM, xfer->txd[0]);
+
+ if (i == rx_front) {
+ acpm_get_saved_rx(achan, xfer, tx_seqnum);
return 0;
+ }
base = achan->rx.base;
mlen = achan->mlen;
- tx_seqnum = FIELD_GET(ACPM_PROTOCOL_SEQNUM, xfer->txd[0]);
-
/* Drain RX queue. */
do {
/* Read RX seqnum. */
@@ -259,16 +283,8 @@ static int acpm_get_rx(struct acpm_chan *achan, const struct acpm_xfer *xfer)
* If the response was not in this iteration of the queue, check if the
* RX data was previously saved.
*/
- rx_data = &achan->rx_data[tx_seqnum - 1];
- if (!rx_set && rx_data->response) {
- rx_seqnum = FIELD_GET(ACPM_PROTOCOL_SEQNUM,
- rx_data->cmd[0]);
-
- if (rx_seqnum == tx_seqnum) {
- memcpy(xfer->rxd, rx_data->cmd, xfer->rxlen);
- clear_bit(rx_seqnum - 1, achan->bitmap_seqnum);
- }
- }
+ if (!rx_set)
+ acpm_get_saved_rx(achan, xfer, tx_seqnum);
return 0;
}
diff --git a/drivers/gpio/gpio-pca953x.c b/drivers/gpio/gpio-pca953x.c
index 442435ded020..13cc120cf11f 100644
--- a/drivers/gpio/gpio-pca953x.c
+++ b/drivers/gpio/gpio-pca953x.c
@@ -1204,6 +1204,8 @@ static int pca953x_restore_context(struct pca953x_chip *chip)
guard(mutex)(&chip->i2c_lock);
+ if (chip->client->irq > 0)
+ enable_irq(chip->client->irq);
regcache_cache_only(chip->regmap, false);
regcache_mark_dirty(chip->regmap);
ret = pca953x_regcache_sync(chip);
@@ -1216,6 +1218,10 @@ static int pca953x_restore_context(struct pca953x_chip *chip)
static void pca953x_save_context(struct pca953x_chip *chip)
{
guard(mutex)(&chip->i2c_lock);
+
+ /* Disable IRQ to prevent early triggering while regmap "cache only" is on */
+ if (chip->client->irq > 0)
+ disable_irq(chip->client->irq);
regcache_cache_only(chip->regmap, true);
}
diff --git a/drivers/gpio/gpio-virtuser.c b/drivers/gpio/gpio-virtuser.c
index 13407fd4f0eb..eab6726953b4 100644
--- a/drivers/gpio/gpio-virtuser.c
+++ b/drivers/gpio/gpio-virtuser.c
@@ -401,10 +401,15 @@ static ssize_t gpio_virtuser_direction_do_write(struct file *file,
char buf[32], *trimmed;
int ret, dir, val = 0;
- ret = simple_write_to_buffer(buf, sizeof(buf), ppos, user_buf, count);
+ if (count >= sizeof(buf))
+ return -EINVAL;
+
+ ret = simple_write_to_buffer(buf, sizeof(buf) - 1, ppos, user_buf, count);
if (ret < 0)
return ret;
+ buf[ret] = '\0';
+
trimmed = strim(buf);
if (strcmp(trimmed, "input") == 0) {
@@ -623,12 +628,15 @@ static ssize_t gpio_virtuser_consumer_write(struct file *file,
char buf[GPIO_VIRTUSER_NAME_BUF_LEN + 2];
int ret;
+ if (count >= sizeof(buf))
+ return -EINVAL;
+
ret = simple_write_to_buffer(buf, GPIO_VIRTUSER_NAME_BUF_LEN, ppos,
user_buf, count);
if (ret < 0)
return ret;
- buf[strlen(buf) - 1] = '\0';
+ buf[ret] = '\0';
ret = gpiod_set_consumer_name(data->ad.desc, buf);
if (ret)
diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index cd4fecbb41f2..113c5d90f2df 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -742,6 +742,12 @@ EXPORT_SYMBOL_GPL(gpiochip_query_valid_mask);
bool gpiochip_line_is_valid(const struct gpio_chip *gc,
unsigned int offset)
{
+ /*
+ * hog pins are requested before registering GPIO chip
+ */
+ if (!gc->gpiodev)
+ return true;
+
/* No mask means all valid */
if (likely(!gc->gpiodev->valid_mask))
return true;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c
index cfdf558b48b6..02138aa55793 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c
@@ -109,7 +109,7 @@ int amdgpu_unmap_static_csa(struct amdgpu_device *adev, struct amdgpu_vm *vm,
struct drm_exec exec;
int r;
- drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT, 0);
+ drm_exec_init(&exec, 0, 0);
drm_exec_until_all_locked(&exec) {
r = amdgpu_vm_lock_pd(vm, &exec, 0);
if (likely(!r))
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
index e74e26b6a4f2..fec9a007533a 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
@@ -752,6 +752,18 @@ static int gmc_v11_0_sw_init(struct amdgpu_ip_block *ip_block)
adev->gmc.vram_type = vram_type;
adev->gmc.vram_vendor = vram_vendor;
+ /* The mall_size is already calculated as mall_size_per_umc * num_umc.
+ * However, for gfx1151, which features a 2-to-1 UMC mapping,
+ * the result must be multiplied by 2 to determine the actual mall size.
+ */
+ switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
+ case IP_VERSION(11, 5, 1):
+ adev->gmc.mall_size *= 2;
+ break;
+ default:
+ break;
+ }
+
switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
case IP_VERSION(11, 0, 0):
case IP_VERSION(11, 0, 1):
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c
index a1171e6152ed..f11df9c2ec13 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c
@@ -1023,6 +1023,10 @@ static int vcn_v4_0_5_start_dpg_mode(struct amdgpu_vcn_inst *vinst,
ring->doorbell_index << VCN_RB1_DB_CTRL__OFFSET__SHIFT |
VCN_RB1_DB_CTRL__EN_MASK);
+ /* Keeping one read-back to ensure all register writes are done, otherwise
+ * it may introduce race conditions */
+ RREG32_SOC15(VCN, inst_idx, regVCN_RB1_DB_CTRL);
+
return 0;
}
@@ -1205,6 +1209,10 @@ static int vcn_v4_0_5_start(struct amdgpu_vcn_inst *vinst)
WREG32_SOC15(VCN, i, regVCN_RB_ENABLE, tmp);
fw_shared->sq.queue_mode &= ~(FW_QUEUE_RING_RESET | FW_QUEUE_DPG_HOLD_OFF);
+ /* Keeping one read-back to ensure all register writes are done, otherwise
+ * it may introduce race conditions */
+ RREG32_SOC15(VCN, i, regVCN_RB_ENABLE);
+
return 0;
}
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index 64df8ca448b3..a187cdb43e7e 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -372,6 +372,8 @@ get_crtc_by_otg_inst(struct amdgpu_device *adev,
static inline bool is_dc_timing_adjust_needed(struct dm_crtc_state *old_state,
struct dm_crtc_state *new_state)
{
+ if (new_state->stream->adjust.timing_adjust_pending)
+ return true;
if (new_state->freesync_config.state == VRR_STATE_ACTIVE_FIXED)
return true;
else if (amdgpu_dm_crtc_vrr_active(old_state) != amdgpu_dm_crtc_vrr_active(new_state))
@@ -3467,11 +3469,6 @@ static int dm_resume(struct amdgpu_ip_block *ip_block)
return 0;
}
-
- /* leave display off for S4 sequence */
- if (adev->in_s4)
- return 0;
-
/* Recreate dc_state - DC invalidates it when setting power state to S3. */
dc_state_release(dm_state->context);
dm_state->context = dc_state_create(dm->dc, NULL);
@@ -12763,7 +12760,8 @@ int amdgpu_dm_process_dmub_aux_transfer_sync(
/* The reply is stored in the top nibble of the command. */
payload->reply[0] = (adev->dm.dmub_notify->aux_reply.command >> 4) & 0xF;
- if (!payload->write && p_notify->aux_reply.length)
+ /*write req may receive a byte indicating partially written number as well*/
+ if (p_notify->aux_reply.length)
memcpy(payload->data, p_notify->aux_reply.data,
p_notify->aux_reply.length);
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c
index 074b79fd5822..5cdbc86ef8f5 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c
@@ -62,6 +62,7 @@ static ssize_t dm_dp_aux_transfer(struct drm_dp_aux *aux,
enum aux_return_code_type operation_result;
struct amdgpu_device *adev;
struct ddc_service *ddc;
+ uint8_t copy[16];
if (WARN_ON(msg->size > 16))
return -E2BIG;
@@ -77,6 +78,11 @@ static ssize_t dm_dp_aux_transfer(struct drm_dp_aux *aux,
(msg->request & DP_AUX_I2C_WRITE_STATUS_UPDATE) != 0;
payload.defer_delay = 0;
+ if (payload.write) {
+ memcpy(copy, msg->buffer, msg->size);
+ payload.data = copy;
+ }
+
result = dc_link_aux_transfer_raw(TO_DM_AUX(aux)->ddc_service, &payload,
&operation_result);
@@ -100,9 +106,9 @@ static ssize_t dm_dp_aux_transfer(struct drm_dp_aux *aux,
*/
if (payload.write && result >= 0) {
if (result) {
- /*one byte indicating partially written bytes. Force 0 to retry*/
- drm_info(adev_to_drm(adev), "amdgpu: AUX partially written\n");
- result = 0;
+ /*one byte indicating partially written bytes*/
+ drm_dbg_dp(adev_to_drm(adev), "amdgpu: AUX partially written\n");
+ result = payload.data[0];
} else if (!payload.reply[0])
/*I2C_ACK|AUX_ACK*/
result = msg->size;
@@ -127,11 +133,11 @@ static ssize_t dm_dp_aux_transfer(struct drm_dp_aux *aux,
break;
}
- drm_info(adev_to_drm(adev), "amdgpu: DP AUX transfer fail:%d\n", operation_result);
+ drm_dbg_dp(adev_to_drm(adev), "amdgpu: DP AUX transfer fail:%d\n", operation_result);
}
if (payload.reply[0])
- drm_info(adev_to_drm(adev), "amdgpu: AUX reply command not ACK: 0x%02x.",
+ drm_dbg_dp(adev_to_drm(adev), "amdgpu: AUX reply command not ACK: 0x%02x.",
payload.reply[0]);
return result;
diff --git a/drivers/gpu/drm/amd/display/dc/core/dc.c b/drivers/gpu/drm/amd/display/dc/core/dc.c
index 28d1353f403d..ba4ce8a63158 100644
--- a/drivers/gpu/drm/amd/display/dc/core/dc.c
+++ b/drivers/gpu/drm/amd/display/dc/core/dc.c
@@ -439,9 +439,12 @@ bool dc_stream_adjust_vmin_vmax(struct dc *dc,
* Don't adjust DRR while there's bandwidth optimizations pending to
* avoid conflicting with firmware updates.
*/
- if (dc->ctx->dce_version > DCE_VERSION_MAX)
- if (dc->optimized_required || dc->wm_optimized_required)
+ if (dc->ctx->dce_version > DCE_VERSION_MAX) {
+ if (dc->optimized_required || dc->wm_optimized_required) {
+ stream->adjust.timing_adjust_pending = true;
return false;
+ }
+ }
dc_exit_ips_for_hw_access(dc);
@@ -3168,7 +3171,8 @@ static void copy_stream_update_to_stream(struct dc *dc,
if (update->crtc_timing_adjust) {
if (stream->adjust.v_total_min != update->crtc_timing_adjust->v_total_min ||
- stream->adjust.v_total_max != update->crtc_timing_adjust->v_total_max)
+ stream->adjust.v_total_max != update->crtc_timing_adjust->v_total_max ||
+ stream->adjust.timing_adjust_pending)
update->crtc_timing_adjust->timing_adjust_pending = true;
stream->adjust = *update->crtc_timing_adjust;
update->crtc_timing_adjust->timing_adjust_pending = false;
diff --git a/drivers/gpu/drm/amd/display/dc/dml/dcn35/dcn35_fpu.c b/drivers/gpu/drm/amd/display/dc/dml/dcn35/dcn35_fpu.c
index d9159ca55412..92f0a099d089 100644
--- a/drivers/gpu/drm/amd/display/dc/dml/dcn35/dcn35_fpu.c
+++ b/drivers/gpu/drm/amd/display/dc/dml/dcn35/dcn35_fpu.c
@@ -195,9 +195,9 @@ struct _vcs_dpi_soc_bounding_box_st dcn3_5_soc = {
.dcn_downspread_percent = 0.5,
.gpuvm_min_page_size_bytes = 4096,
.hostvm_min_page_size_bytes = 4096,
- .do_urgent_latency_adjustment = 1,
+ .do_urgent_latency_adjustment = 0,
.urgent_latency_adjustment_fabric_clock_component_us = 0,
- .urgent_latency_adjustment_fabric_clock_reference_mhz = 3000,
+ .urgent_latency_adjustment_fabric_clock_reference_mhz = 0,
};
void dcn35_build_wm_range_table_fpu(struct clk_mgr *clk_mgr)
diff --git a/drivers/gpu/drm/amd/display/dc/dml2/dml21/dml21_translation_helper.c b/drivers/gpu/drm/amd/display/dc/dml2/dml21/dml21_translation_helper.c
index 0c8ec30ea672..731fbd4bc600 100644
--- a/drivers/gpu/drm/amd/display/dc/dml2/dml21/dml21_translation_helper.c
+++ b/drivers/gpu/drm/amd/display/dc/dml2/dml21/dml21_translation_helper.c
@@ -910,7 +910,7 @@ static void populate_dml21_plane_config_from_plane_state(struct dml2_context *dm
}
//TODO : Could be possibly moved to a common helper layer.
-static bool dml21_wrapper_get_plane_id(const struct dc_state *context, const struct dc_plane_state *plane, unsigned int *plane_id)
+static bool dml21_wrapper_get_plane_id(const struct dc_state *context, unsigned int stream_id, const struct dc_plane_state *plane, unsigned int *plane_id)
{
int i, j;
@@ -918,10 +918,12 @@ static bool dml21_wrapper_get_plane_id(const struct dc_state *context, const str
return false;
for (i = 0; i < context->stream_count; i++) {
- for (j = 0; j < context->stream_status[i].plane_count; j++) {
- if (context->stream_status[i].plane_states[j] == plane) {
- *plane_id = (i << 16) | j;
- return true;
+ if (context->streams[i]->stream_id == stream_id) {
+ for (j = 0; j < context->stream_status[i].plane_count; j++) {
+ if (context->stream_status[i].plane_states[j] == plane) {
+ *plane_id = (i << 16) | j;
+ return true;
+ }
}
}
}
@@ -944,14 +946,14 @@ static unsigned int map_stream_to_dml21_display_cfg(const struct dml2_context *d
return location;
}
-static unsigned int map_plane_to_dml21_display_cfg(const struct dml2_context *dml_ctx,
+static unsigned int map_plane_to_dml21_display_cfg(const struct dml2_context *dml_ctx, unsigned int stream_id,
const struct dc_plane_state *plane, const struct dc_state *context)
{
unsigned int plane_id;
int i = 0;
int location = -1;
- if (!dml21_wrapper_get_plane_id(context, plane, &plane_id)) {
+ if (!dml21_wrapper_get_plane_id(context, stream_id, plane, &plane_id)) {
ASSERT(false);
return -1;
}
@@ -1037,7 +1039,7 @@ bool dml21_map_dc_state_into_dml_display_cfg(const struct dc *in_dc, struct dc_s
dml_dispcfg->plane_descriptors[disp_cfg_plane_location].stream_index = disp_cfg_stream_location;
} else {
for (plane_index = 0; plane_index < context->stream_status[stream_index].plane_count; plane_index++) {
- disp_cfg_plane_location = map_plane_to_dml21_display_cfg(dml_ctx, context->stream_status[stream_index].plane_states[plane_index], context);
+ disp_cfg_plane_location = map_plane_to_dml21_display_cfg(dml_ctx, context->streams[stream_index]->stream_id, context->stream_status[stream_index].plane_states[plane_index], context);
if (disp_cfg_plane_location < 0)
disp_cfg_plane_location = dml_dispcfg->num_planes++;
@@ -1048,7 +1050,7 @@ bool dml21_map_dc_state_into_dml_display_cfg(const struct dc *in_dc, struct dc_s
populate_dml21_plane_config_from_plane_state(dml_ctx, &dml_dispcfg->plane_descriptors[disp_cfg_plane_location], context->stream_status[stream_index].plane_states[plane_index], context, stream_index);
dml_dispcfg->plane_descriptors[disp_cfg_plane_location].stream_index = disp_cfg_stream_location;
- if (dml21_wrapper_get_plane_id(context, context->stream_status[stream_index].plane_states[plane_index], &dml_ctx->v21.dml_to_dc_pipe_mapping.disp_cfg_to_plane_id[disp_cfg_plane_location]))
+ if (dml21_wrapper_get_plane_id(context, context->streams[stream_index]->stream_id, context->stream_status[stream_index].plane_states[plane_index], &dml_ctx->v21.dml_to_dc_pipe_mapping.disp_cfg_to_plane_id[disp_cfg_plane_location]))
dml_ctx->v21.dml_to_dc_pipe_mapping.disp_cfg_to_plane_id_valid[disp_cfg_plane_location] = true;
/* apply forced pstate policy */
diff --git a/drivers/gpu/drm/amd/display/dc/dpp/dcn401/dcn401_dpp_cm.c b/drivers/gpu/drm/amd/display/dc/dpp/dcn401/dcn401_dpp_cm.c
index 1236e0f9a256..712aff7e17f7 100644
--- a/drivers/gpu/drm/amd/display/dc/dpp/dcn401/dcn401_dpp_cm.c
+++ b/drivers/gpu/drm/amd/display/dc/dpp/dcn401/dcn401_dpp_cm.c
@@ -120,10 +120,11 @@ void dpp401_set_cursor_attributes(
enum dc_cursor_color_format color_format = cursor_attributes->color_format;
int cur_rom_en = 0;
- // DCN4 should always do Cursor degamma for Cursor Color modes
if (color_format == CURSOR_MODE_COLOR_PRE_MULTIPLIED_ALPHA ||
color_format == CURSOR_MODE_COLOR_UN_PRE_MULTIPLIED_ALPHA) {
- cur_rom_en = 1;
+ if (cursor_attributes->attribute_flags.bits.ENABLE_CURSOR_DEGAMMA) {
+ cur_rom_en = 1;
+ }
}
REG_UPDATE_3(CURSOR0_CONTROL,
diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn401/dcn401_hwseq.c b/drivers/gpu/drm/amd/display/dc/hwss/dcn401/dcn401_hwseq.c
index 5489f3d431f6..3af6a3402b89 100644
--- a/drivers/gpu/drm/amd/display/dc/hwss/dcn401/dcn401_hwseq.c
+++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn401/dcn401_hwseq.c
@@ -1980,9 +1980,9 @@ void dcn401_program_pipe(
dc->res_pool->hubbub, pipe_ctx->plane_res.hubp->inst, pipe_ctx->hubp_regs.det_size);
}
- if (pipe_ctx->update_flags.raw ||
- (pipe_ctx->plane_state && pipe_ctx->plane_state->update_flags.raw) ||
- pipe_ctx->stream->update_flags.raw)
+ if (pipe_ctx->plane_state && (pipe_ctx->update_flags.raw ||
+ pipe_ctx->plane_state->update_flags.raw ||
+ pipe_ctx->stream->update_flags.raw))
dc->hwss.update_dchubp_dpp(dc, pipe_ctx, context);
if (pipe_ctx->plane_state && (pipe_ctx->update_flags.bits.enable ||
diff --git a/drivers/gpu/drm/amd/display/dc/link/link_dpms.c b/drivers/gpu/drm/amd/display/dc/link/link_dpms.c
index 268626e73c54..53c961f86d43 100644
--- a/drivers/gpu/drm/amd/display/dc/link/link_dpms.c
+++ b/drivers/gpu/drm/amd/display/dc/link/link_dpms.c
@@ -148,6 +148,7 @@ void link_blank_dp_stream(struct dc_link *link, bool hw_init)
void link_set_all_streams_dpms_off_for_link(struct dc_link *link)
{
struct pipe_ctx *pipes[MAX_PIPES];
+ struct dc_stream_state *streams[MAX_PIPES];
struct dc_state *state = link->dc->current_state;
uint8_t count;
int i;
@@ -160,10 +161,18 @@ void link_set_all_streams_dpms_off_for_link(struct dc_link *link)
link_get_master_pipes_with_dpms_on(link, state, &count, pipes);
+ /* The subsequent call to dc_commit_updates_for_stream for a full update
+ * will release the current state and swap to a new state. Releasing the
+ * current state results in the stream pointers in the pipe_ctx structs
+ * to be zero'd. Hence, cache all streams prior to dc_commit_updates_for_stream.
+ */
+ for (i = 0; i < count; i++)
+ streams[i] = pipes[i]->stream;
+
for (i = 0; i < count; i++) {
- stream_update.stream = pipes[i]->stream;
+ stream_update.stream = streams[i];
dc_commit_updates_for_stream(link->ctx->dc, NULL, 0,
- pipes[i]->stream, &stream_update,
+ streams[i], &stream_update,
state);
}
diff --git a/drivers/gpu/drm/drm_edid.c b/drivers/gpu/drm/drm_edid.c
index 13bc4c290b17..9edb3247c767 100644
--- a/drivers/gpu/drm/drm_edid.c
+++ b/drivers/gpu/drm/drm_edid.c
@@ -6596,6 +6596,7 @@ static void drm_reset_display_info(struct drm_connector *connector)
info->has_hdmi_infoframe = false;
info->rgb_quant_range_selectable = false;
memset(&info->hdmi, 0, sizeof(info->hdmi));
+ memset(&connector->hdr_sink_metadata, 0, sizeof(connector->hdr_sink_metadata));
info->edid_hdmi_rgb444_dc_modes = 0;
info->edid_hdmi_ycbcr444_dc_modes = 0;
diff --git a/drivers/gpu/drm/drm_gpusvm.c b/drivers/gpu/drm/drm_gpusvm.c
index de424e670995..4b2f32889f00 100644
--- a/drivers/gpu/drm/drm_gpusvm.c
+++ b/drivers/gpu/drm/drm_gpusvm.c
@@ -1118,6 +1118,10 @@ static void __drm_gpusvm_range_unmap_pages(struct drm_gpusvm *gpusvm,
lockdep_assert_held(&gpusvm->notifier_lock);
if (range->flags.has_dma_mapping) {
+ struct drm_gpusvm_range_flags flags = {
+ .__flags = range->flags.__flags,
+ };
+
for (i = 0, j = 0; i < npages; j++) {
struct drm_pagemap_device_addr *addr = &range->dma_addr[j];
@@ -1131,8 +1135,12 @@ static void __drm_gpusvm_range_unmap_pages(struct drm_gpusvm *gpusvm,
dev, *addr);
i += 1 << addr->order;
}
- range->flags.has_devmem_pages = false;
- range->flags.has_dma_mapping = false;
+
+ /* WRITE_ONCE pairs with READ_ONCE for opportunistic checks */
+ flags.has_devmem_pages = false;
+ flags.has_dma_mapping = false;
+ WRITE_ONCE(range->flags.__flags, flags.__flags);
+
range->dpagemap = NULL;
}
}
@@ -1334,6 +1342,7 @@ int drm_gpusvm_range_get_pages(struct drm_gpusvm *gpusvm,
int err = 0;
struct dev_pagemap *pagemap;
struct drm_pagemap *dpagemap;
+ struct drm_gpusvm_range_flags flags;
retry:
hmm_range.notifier_seq = mmu_interval_read_begin(notifier);
@@ -1378,7 +1387,8 @@ map_pages:
*/
drm_gpusvm_notifier_lock(gpusvm);
- if (range->flags.unmapped) {
+ flags.__flags = range->flags.__flags;
+ if (flags.unmapped) {
drm_gpusvm_notifier_unlock(gpusvm);
err = -EFAULT;
goto err_free;
@@ -1454,6 +1464,11 @@ map_pages:
goto err_unmap;
}
+ if (ctx->devmem_only) {
+ err = -EFAULT;
+ goto err_unmap;
+ }
+
addr = dma_map_page(gpusvm->drm->dev,
page, 0,
PAGE_SIZE << order,
@@ -1469,14 +1484,17 @@ map_pages:
}
i += 1 << order;
num_dma_mapped = i;
- range->flags.has_dma_mapping = true;
+ flags.has_dma_mapping = true;
}
if (zdd) {
- range->flags.has_devmem_pages = true;
+ flags.has_devmem_pages = true;
range->dpagemap = dpagemap;
}
+ /* WRITE_ONCE pairs with READ_ONCE for opportunistic checks */
+ WRITE_ONCE(range->flags.__flags, flags.__flags);
+
drm_gpusvm_notifier_unlock(gpusvm);
kvfree(pfns);
set_seqno:
@@ -1765,6 +1783,8 @@ int drm_gpusvm_migrate_to_devmem(struct drm_gpusvm *gpusvm,
goto err_finalize;
/* Upon success bind devmem allocation to range and zdd */
+ devmem_allocation->timeslice_expiration = get_jiffies_64() +
+ msecs_to_jiffies(ctx->timeslice_ms);
zdd->devmem_allocation = devmem_allocation; /* Owns ref */
err_finalize:
@@ -1985,6 +2005,13 @@ static int __drm_gpusvm_migrate_to_ram(struct vm_area_struct *vas,
void *buf;
int i, err = 0;
+ if (page) {
+ zdd = page->zone_device_data;
+ if (time_before64(get_jiffies_64(),
+ zdd->devmem_allocation->timeslice_expiration))
+ return 0;
+ }
+
start = ALIGN_DOWN(fault_addr, size);
end = ALIGN(fault_addr + 1, size);
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_shmem.c b/drivers/gpu/drm/i915/gem/i915_gem_shmem.c
index ae3343c81a64..5e784db9f315 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_shmem.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_shmem.c
@@ -305,36 +305,20 @@ void __shmem_writeback(size_t size, struct address_space *mapping)
.range_end = LLONG_MAX,
.for_reclaim = 1,
};
- unsigned long i;
+ struct folio *folio = NULL;
+ int error = 0;
/*
* Leave mmapings intact (GTT will have been revoked on unbinding,
- * leaving only CPU mmapings around) and add those pages to the LRU
+ * leaving only CPU mmapings around) and add those folios to the LRU
* instead of invoking writeback so they are aged and paged out
* as normal.
*/
-
- /* Begin writeback on each dirty page */
- for (i = 0; i < size >> PAGE_SHIFT; i++) {
- struct page *page;
-
- page = find_lock_page(mapping, i);
- if (!page)
- continue;
-
- if (!page_mapped(page) && clear_page_dirty_for_io(page)) {
- int ret;
-
- SetPageReclaim(page);
- ret = mapping->a_ops->writepage(page, &wbc);
- if (!PageWriteback(page))
- ClearPageReclaim(page);
- if (!ret)
- goto put;
- }
- unlock_page(page);
-put:
- put_page(page);
+ while ((folio = writeback_iter(mapping, &wbc, folio, &error))) {
+ if (folio_mapped(folio))
+ folio_redirty_for_writepage(&wbc, folio);
+ else
+ error = shmem_writeout(folio, &wbc);
}
}
diff --git a/drivers/gpu/drm/meson/meson_encoder_hdmi.c b/drivers/gpu/drm/meson/meson_encoder_hdmi.c
index 7752d8ac85f0..c08fa93e50a3 100644
--- a/drivers/gpu/drm/meson/meson_encoder_hdmi.c
+++ b/drivers/gpu/drm/meson/meson_encoder_hdmi.c
@@ -75,7 +75,7 @@ static void meson_encoder_hdmi_set_vclk(struct meson_encoder_hdmi *encoder_hdmi,
unsigned long long venc_freq;
unsigned long long hdmi_freq;
- vclk_freq = mode->clock * 1000;
+ vclk_freq = mode->clock * 1000ULL;
/* For 420, pixel clock is half unlike venc clock */
if (encoder_hdmi->output_bus_fmt == MEDIA_BUS_FMT_UYYVYY8_0_5X24)
@@ -123,7 +123,7 @@ static enum drm_mode_status meson_encoder_hdmi_mode_valid(struct drm_bridge *bri
struct meson_encoder_hdmi *encoder_hdmi = bridge_to_meson_encoder_hdmi(bridge);
struct meson_drm *priv = encoder_hdmi->priv;
bool is_hdmi2_sink = display_info->hdmi.scdc.supported;
- unsigned long long clock = mode->clock * 1000;
+ unsigned long long clock = mode->clock * 1000ULL;
unsigned long long phy_freq;
unsigned long long vclk_freq;
unsigned long long venc_freq;
diff --git a/drivers/gpu/drm/tiny/panel-mipi-dbi.c b/drivers/gpu/drm/tiny/panel-mipi-dbi.c
index 0460ecaef4bd..23914a9f7fd3 100644
--- a/drivers/gpu/drm/tiny/panel-mipi-dbi.c
+++ b/drivers/gpu/drm/tiny/panel-mipi-dbi.c
@@ -390,7 +390,10 @@ static int panel_mipi_dbi_spi_probe(struct spi_device *spi)
spi_set_drvdata(spi, drm);
- drm_client_setup(drm, NULL);
+ if (bpp == 16)
+ drm_client_setup_with_fourcc(drm, DRM_FORMAT_RGB565);
+ else
+ drm_client_setup_with_fourcc(drm, DRM_FORMAT_RGB888);
return 0;
}
diff --git a/drivers/gpu/drm/ttm/ttm_backup.c b/drivers/gpu/drm/ttm/ttm_backup.c
index 9e2d72c447ee..ffaab68bd5dd 100644
--- a/drivers/gpu/drm/ttm/ttm_backup.c
+++ b/drivers/gpu/drm/ttm/ttm_backup.c
@@ -120,13 +120,13 @@ ttm_backup_backup_page(struct file *backup, struct page *page,
.for_reclaim = 1,
};
folio_set_reclaim(to_folio);
- ret = mapping->a_ops->writepage(folio_file_page(to_folio, idx), &wbc);
+ ret = shmem_writeout(to_folio, &wbc);
if (!folio_test_writeback(to_folio))
folio_clear_reclaim(to_folio);
/*
- * If writepage succeeds, it unlocks the folio.
- * writepage() errors are otherwise dropped, since writepage()
- * is only best effort here.
+ * If writeout succeeds, it unlocks the folio. errors
+ * are otherwise dropped, since writeout is only best
+ * effort here.
*/
if (ret)
folio_unlock(to_folio);
diff --git a/drivers/gpu/drm/xe/instructions/xe_mi_commands.h b/drivers/gpu/drm/xe/instructions/xe_mi_commands.h
index 167fb0f742de..5a47991b4b81 100644
--- a/drivers/gpu/drm/xe/instructions/xe_mi_commands.h
+++ b/drivers/gpu/drm/xe/instructions/xe_mi_commands.h
@@ -47,6 +47,10 @@
#define MI_LRI_FORCE_POSTED REG_BIT(12)
#define MI_LRI_LEN(x) (((x) & 0xff) + 1)
+#define MI_STORE_REGISTER_MEM (__MI_INSTR(0x24) | XE_INSTR_NUM_DW(4))
+#define MI_SRM_USE_GGTT REG_BIT(22)
+#define MI_SRM_ADD_CS_OFFSET REG_BIT(19)
+
#define MI_FLUSH_DW __MI_INSTR(0x26)
#define MI_FLUSH_DW_PROTECTED_MEM_EN REG_BIT(22)
#define MI_FLUSH_DW_STORE_INDEX REG_BIT(21)
diff --git a/drivers/gpu/drm/xe/regs/xe_engine_regs.h b/drivers/gpu/drm/xe/regs/xe_engine_regs.h
index fb8ec317b6ee..891f928d80ce 100644
--- a/drivers/gpu/drm/xe/regs/xe_engine_regs.h
+++ b/drivers/gpu/drm/xe/regs/xe_engine_regs.h
@@ -43,6 +43,10 @@
#define XEHPC_BCS8_RING_BASE 0x3ee000
#define GSCCS_RING_BASE 0x11a000
+#define ENGINE_ID(base) XE_REG((base) + 0x8c)
+#define ENGINE_INSTANCE_ID REG_GENMASK(9, 4)
+#define ENGINE_CLASS_ID REG_GENMASK(2, 0)
+
#define RING_TAIL(base) XE_REG((base) + 0x30)
#define TAIL_ADDR REG_GENMASK(20, 3)
@@ -154,6 +158,7 @@
#define STOP_RING REG_BIT(8)
#define RING_CTX_TIMESTAMP(base) XE_REG((base) + 0x3a8)
+#define RING_CTX_TIMESTAMP_UDW(base) XE_REG((base) + 0x3ac)
#define CSBE_DEBUG_STATUS(base) XE_REG((base) + 0x3fc)
#define RING_FORCE_TO_NONPRIV(base, i) XE_REG(((base) + 0x4d0) + (i) * 4)
diff --git a/drivers/gpu/drm/xe/regs/xe_gt_regs.h b/drivers/gpu/drm/xe/regs/xe_gt_regs.h
index da1f198ac107..181913967ac9 100644
--- a/drivers/gpu/drm/xe/regs/xe_gt_regs.h
+++ b/drivers/gpu/drm/xe/regs/xe_gt_regs.h
@@ -157,6 +157,7 @@
#define XEHPG_SC_INSTDONE_EXTRA2 XE_REG_MCR(0x7108)
#define COMMON_SLICE_CHICKEN4 XE_REG(0x7300, XE_REG_OPTION_MASKED)
+#define SBE_PUSH_CONSTANT_BEHIND_FIX_ENABLE REG_BIT(12)
#define DISABLE_TDC_LOAD_BALANCING_CALC REG_BIT(6)
#define COMMON_SLICE_CHICKEN3 XE_REG(0x7304, XE_REG_OPTION_MASKED)
diff --git a/drivers/gpu/drm/xe/regs/xe_lrc_layout.h b/drivers/gpu/drm/xe/regs/xe_lrc_layout.h
index 57944f90bbf6..994af591a2e8 100644
--- a/drivers/gpu/drm/xe/regs/xe_lrc_layout.h
+++ b/drivers/gpu/drm/xe/regs/xe_lrc_layout.h
@@ -11,7 +11,9 @@
#define CTX_RING_TAIL (0x06 + 1)
#define CTX_RING_START (0x08 + 1)
#define CTX_RING_CTL (0x0a + 1)
+#define CTX_BB_PER_CTX_PTR (0x12 + 1)
#define CTX_TIMESTAMP (0x22 + 1)
+#define CTX_TIMESTAMP_UDW (0x24 + 1)
#define CTX_INDIRECT_RING_STATE (0x26 + 1)
#define CTX_PDP0_UDW (0x30 + 1)
#define CTX_PDP0_LDW (0x32 + 1)
diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
index 9f8667ebba85..0482f26aa480 100644
--- a/drivers/gpu/drm/xe/xe_device_types.h
+++ b/drivers/gpu/drm/xe/xe_device_types.h
@@ -330,6 +330,8 @@ struct xe_device {
u8 has_sriov:1;
/** @info.has_usm: Device has unified shared memory support */
u8 has_usm:1;
+ /** @info.has_64bit_timestamp: Device supports 64-bit timestamps */
+ u8 has_64bit_timestamp:1;
/** @info.is_dgfx: is discrete device */
u8 is_dgfx:1;
/**
diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c
index 606922d9dd73..cd9b1c32f30f 100644
--- a/drivers/gpu/drm/xe/xe_exec_queue.c
+++ b/drivers/gpu/drm/xe/xe_exec_queue.c
@@ -830,7 +830,7 @@ void xe_exec_queue_update_run_ticks(struct xe_exec_queue *q)
{
struct xe_device *xe = gt_to_xe(q->gt);
struct xe_lrc *lrc;
- u32 old_ts, new_ts;
+ u64 old_ts, new_ts;
int idx;
/*
diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
index 31bc2022bfc2..769781d577df 100644
--- a/drivers/gpu/drm/xe/xe_guc_submit.c
+++ b/drivers/gpu/drm/xe/xe_guc_submit.c
@@ -941,7 +941,7 @@ static bool check_timeout(struct xe_exec_queue *q, struct xe_sched_job *job)
return xe_sched_invalidate_job(job, 2);
}
- ctx_timestamp = xe_lrc_ctx_timestamp(q->lrc[0]);
+ ctx_timestamp = lower_32_bits(xe_lrc_ctx_timestamp(q->lrc[0]));
ctx_job_timestamp = xe_lrc_ctx_job_timestamp(q->lrc[0]);
/*
diff --git a/drivers/gpu/drm/xe/xe_lrc.c b/drivers/gpu/drm/xe/xe_lrc.c
index df3ceddede07..03bfba696b37 100644
--- a/drivers/gpu/drm/xe/xe_lrc.c
+++ b/drivers/gpu/drm/xe/xe_lrc.c
@@ -24,6 +24,7 @@
#include "xe_hw_fence.h"
#include "xe_map.h"
#include "xe_memirq.h"
+#include "xe_mmio.h"
#include "xe_sriov.h"
#include "xe_trace_lrc.h"
#include "xe_vm.h"
@@ -650,6 +651,7 @@ u32 xe_lrc_pphwsp_offset(struct xe_lrc *lrc)
#define LRC_START_SEQNO_PPHWSP_OFFSET (LRC_SEQNO_PPHWSP_OFFSET + 8)
#define LRC_CTX_JOB_TIMESTAMP_OFFSET (LRC_START_SEQNO_PPHWSP_OFFSET + 8)
#define LRC_PARALLEL_PPHWSP_OFFSET 2048
+#define LRC_ENGINE_ID_PPHWSP_OFFSET 2096
#define LRC_PPHWSP_SIZE SZ_4K
u32 xe_lrc_regs_offset(struct xe_lrc *lrc)
@@ -684,7 +686,7 @@ static inline u32 __xe_lrc_start_seqno_offset(struct xe_lrc *lrc)
static u32 __xe_lrc_ctx_job_timestamp_offset(struct xe_lrc *lrc)
{
- /* The start seqno is stored in the driver-defined portion of PPHWSP */
+ /* This is stored in the driver-defined portion of PPHWSP */
return xe_lrc_pphwsp_offset(lrc) + LRC_CTX_JOB_TIMESTAMP_OFFSET;
}
@@ -694,11 +696,21 @@ static inline u32 __xe_lrc_parallel_offset(struct xe_lrc *lrc)
return xe_lrc_pphwsp_offset(lrc) + LRC_PARALLEL_PPHWSP_OFFSET;
}
+static inline u32 __xe_lrc_engine_id_offset(struct xe_lrc *lrc)
+{
+ return xe_lrc_pphwsp_offset(lrc) + LRC_ENGINE_ID_PPHWSP_OFFSET;
+}
+
static u32 __xe_lrc_ctx_timestamp_offset(struct xe_lrc *lrc)
{
return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP * sizeof(u32);
}
+static u32 __xe_lrc_ctx_timestamp_udw_offset(struct xe_lrc *lrc)
+{
+ return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP_UDW * sizeof(u32);
+}
+
static inline u32 __xe_lrc_indirect_ring_offset(struct xe_lrc *lrc)
{
/* Indirect ring state page is at the very end of LRC */
@@ -726,8 +738,10 @@ DECL_MAP_ADDR_HELPERS(regs)
DECL_MAP_ADDR_HELPERS(start_seqno)
DECL_MAP_ADDR_HELPERS(ctx_job_timestamp)
DECL_MAP_ADDR_HELPERS(ctx_timestamp)
+DECL_MAP_ADDR_HELPERS(ctx_timestamp_udw)
DECL_MAP_ADDR_HELPERS(parallel)
DECL_MAP_ADDR_HELPERS(indirect_ring)
+DECL_MAP_ADDR_HELPERS(engine_id)
#undef DECL_MAP_ADDR_HELPERS
@@ -743,18 +757,37 @@ u32 xe_lrc_ctx_timestamp_ggtt_addr(struct xe_lrc *lrc)
}
/**
+ * xe_lrc_ctx_timestamp_udw_ggtt_addr() - Get ctx timestamp udw GGTT address
+ * @lrc: Pointer to the lrc.
+ *
+ * Returns: ctx timestamp udw GGTT address
+ */
+u32 xe_lrc_ctx_timestamp_udw_ggtt_addr(struct xe_lrc *lrc)
+{
+ return __xe_lrc_ctx_timestamp_udw_ggtt_addr(lrc);
+}
+
+/**
* xe_lrc_ctx_timestamp() - Read ctx timestamp value
* @lrc: Pointer to the lrc.
*
* Returns: ctx timestamp value
*/
-u32 xe_lrc_ctx_timestamp(struct xe_lrc *lrc)
+u64 xe_lrc_ctx_timestamp(struct xe_lrc *lrc)
{
struct xe_device *xe = lrc_to_xe(lrc);
struct iosys_map map;
+ u32 ldw, udw = 0;
map = __xe_lrc_ctx_timestamp_map(lrc);
- return xe_map_read32(xe, &map);
+ ldw = xe_map_read32(xe, &map);
+
+ if (xe->info.has_64bit_timestamp) {
+ map = __xe_lrc_ctx_timestamp_udw_map(lrc);
+ udw = xe_map_read32(xe, &map);
+ }
+
+ return (u64)udw << 32 | ldw;
}
/**
@@ -864,7 +897,7 @@ static void *empty_lrc_data(struct xe_hw_engine *hwe)
static void xe_lrc_set_ppgtt(struct xe_lrc *lrc, struct xe_vm *vm)
{
- u64 desc = xe_vm_pdp4_descriptor(vm, lrc->tile);
+ u64 desc = xe_vm_pdp4_descriptor(vm, gt_to_tile(lrc->gt));
xe_lrc_write_ctx_reg(lrc, CTX_PDP0_UDW, upper_32_bits(desc));
xe_lrc_write_ctx_reg(lrc, CTX_PDP0_LDW, lower_32_bits(desc));
@@ -877,6 +910,65 @@ static void xe_lrc_finish(struct xe_lrc *lrc)
xe_bo_unpin(lrc->bo);
xe_bo_unlock(lrc->bo);
xe_bo_put(lrc->bo);
+ xe_bo_unpin_map_no_vm(lrc->bb_per_ctx_bo);
+}
+
+/*
+ * xe_lrc_setup_utilization() - Setup wa bb to assist in calculating active
+ * context run ticks.
+ * @lrc: Pointer to the lrc.
+ *
+ * Context Timestamp (CTX_TIMESTAMP) in the LRC accumulates the run ticks of the
+ * context, but only gets updated when the context switches out. In order to
+ * check how long a context has been active before it switches out, two things
+ * are required:
+ *
+ * (1) Determine if the context is running:
+ * To do so, we program the WA BB to set an initial value for CTX_TIMESTAMP in
+ * the LRC. The value chosen is 1 since 0 is the initial value when the LRC is
+ * initialized. During a query, we just check for this value to determine if the
+ * context is active. If the context switched out, it would overwrite this
+ * location with the actual CTX_TIMESTAMP MMIO value. Note that WA BB runs as
+ * the last part of context restore, so reusing this LRC location will not
+ * clobber anything.
+ *
+ * (2) Calculate the time that the context has been active for:
+ * The CTX_TIMESTAMP ticks only when the context is active. If a context is
+ * active, we just use the CTX_TIMESTAMP MMIO as the new value of utilization.
+ * While doing so, we need to read the CTX_TIMESTAMP MMIO for the specific
+ * engine instance. Since we do not know which instance the context is running
+ * on until it is scheduled, we also read the ENGINE_ID MMIO in the WA BB and
+ * store it in the PPHSWP.
+ */
+#define CONTEXT_ACTIVE 1ULL
+static void xe_lrc_setup_utilization(struct xe_lrc *lrc)
+{
+ u32 *cmd;
+
+ cmd = lrc->bb_per_ctx_bo->vmap.vaddr;
+
+ *cmd++ = MI_STORE_REGISTER_MEM | MI_SRM_USE_GGTT | MI_SRM_ADD_CS_OFFSET;
+ *cmd++ = ENGINE_ID(0).addr;
+ *cmd++ = __xe_lrc_engine_id_ggtt_addr(lrc);
+ *cmd++ = 0;
+
+ *cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1);
+ *cmd++ = __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
+ *cmd++ = 0;
+ *cmd++ = lower_32_bits(CONTEXT_ACTIVE);
+
+ if (lrc_to_xe(lrc)->info.has_64bit_timestamp) {
+ *cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1);
+ *cmd++ = __xe_lrc_ctx_timestamp_udw_ggtt_addr(lrc);
+ *cmd++ = 0;
+ *cmd++ = upper_32_bits(CONTEXT_ACTIVE);
+ }
+
+ *cmd++ = MI_BATCH_BUFFER_END;
+
+ xe_lrc_write_ctx_reg(lrc, CTX_BB_PER_CTX_PTR,
+ xe_bo_ggtt_addr(lrc->bb_per_ctx_bo) | 1);
+
}
#define PVC_CTX_ASID (0x2e + 1)
@@ -893,31 +985,40 @@ static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
void *init_data = NULL;
u32 arb_enable;
u32 lrc_size;
+ u32 bo_flags;
int err;
kref_init(&lrc->refcount);
+ lrc->gt = gt;
lrc->flags = 0;
lrc_size = ring_size + xe_gt_lrc_size(gt, hwe->class);
if (xe_gt_has_indirect_ring_state(gt))
lrc->flags |= XE_LRC_FLAG_INDIRECT_RING_STATE;
+ bo_flags = XE_BO_FLAG_VRAM_IF_DGFX(tile) | XE_BO_FLAG_GGTT |
+ XE_BO_FLAG_GGTT_INVALIDATE;
+
/*
* FIXME: Perma-pinning LRC as we don't yet support moving GGTT address
* via VM bind calls.
*/
lrc->bo = xe_bo_create_pin_map(xe, tile, vm, lrc_size,
ttm_bo_type_kernel,
- XE_BO_FLAG_VRAM_IF_DGFX(tile) |
- XE_BO_FLAG_GGTT |
- XE_BO_FLAG_GGTT_INVALIDATE);
+ bo_flags);
if (IS_ERR(lrc->bo))
return PTR_ERR(lrc->bo);
+ lrc->bb_per_ctx_bo = xe_bo_create_pin_map(xe, tile, NULL, SZ_4K,
+ ttm_bo_type_kernel,
+ bo_flags);
+ if (IS_ERR(lrc->bb_per_ctx_bo)) {
+ err = PTR_ERR(lrc->bb_per_ctx_bo);
+ goto err_lrc_finish;
+ }
+
lrc->size = lrc_size;
- lrc->tile = gt_to_tile(hwe->gt);
lrc->ring.size = ring_size;
lrc->ring.tail = 0;
- lrc->ctx_timestamp = 0;
xe_hw_fence_ctx_init(&lrc->fence_ctx, hwe->gt,
hwe->fence_irq, hwe->name);
@@ -990,7 +1091,10 @@ static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
xe_lrc_read_ctx_reg(lrc, CTX_CONTEXT_CONTROL) |
_MASKED_BIT_ENABLE(CTX_CTRL_PXP_ENABLE));
+ lrc->ctx_timestamp = 0;
xe_lrc_write_ctx_reg(lrc, CTX_TIMESTAMP, 0);
+ if (lrc_to_xe(lrc)->info.has_64bit_timestamp)
+ xe_lrc_write_ctx_reg(lrc, CTX_TIMESTAMP_UDW, 0);
if (xe->info.has_asid && vm)
xe_lrc_write_ctx_reg(lrc, PVC_CTX_ASID, vm->usm.asid);
@@ -1019,6 +1123,8 @@ static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
map = __xe_lrc_start_seqno_map(lrc);
xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);
+ xe_lrc_setup_utilization(lrc);
+
return 0;
err_lrc_finish:
@@ -1238,6 +1344,21 @@ struct iosys_map xe_lrc_parallel_map(struct xe_lrc *lrc)
return __xe_lrc_parallel_map(lrc);
}
+/**
+ * xe_lrc_engine_id() - Read engine id value
+ * @lrc: Pointer to the lrc.
+ *
+ * Returns: context id value
+ */
+static u32 xe_lrc_engine_id(struct xe_lrc *lrc)
+{
+ struct xe_device *xe = lrc_to_xe(lrc);
+ struct iosys_map map;
+
+ map = __xe_lrc_engine_id_map(lrc);
+ return xe_map_read32(xe, &map);
+}
+
static int instr_dw(u32 cmd_header)
{
/* GFXPIPE "SINGLE_DW" opcodes are a single dword */
@@ -1684,7 +1805,7 @@ struct xe_lrc_snapshot *xe_lrc_snapshot_capture(struct xe_lrc *lrc)
snapshot->lrc_offset = xe_lrc_pphwsp_offset(lrc);
snapshot->lrc_size = lrc->bo->size - snapshot->lrc_offset;
snapshot->lrc_snapshot = NULL;
- snapshot->ctx_timestamp = xe_lrc_ctx_timestamp(lrc);
+ snapshot->ctx_timestamp = lower_32_bits(xe_lrc_ctx_timestamp(lrc));
snapshot->ctx_job_timestamp = xe_lrc_ctx_job_timestamp(lrc);
return snapshot;
}
@@ -1784,22 +1905,74 @@ void xe_lrc_snapshot_free(struct xe_lrc_snapshot *snapshot)
kfree(snapshot);
}
+static int get_ctx_timestamp(struct xe_lrc *lrc, u32 engine_id, u64 *reg_ctx_ts)
+{
+ u16 class = REG_FIELD_GET(ENGINE_CLASS_ID, engine_id);
+ u16 instance = REG_FIELD_GET(ENGINE_INSTANCE_ID, engine_id);
+ struct xe_hw_engine *hwe;
+ u64 val;
+
+ hwe = xe_gt_hw_engine(lrc->gt, class, instance, false);
+ if (xe_gt_WARN_ONCE(lrc->gt, !hwe || xe_hw_engine_is_reserved(hwe),
+ "Unexpected engine class:instance %d:%d for context utilization\n",
+ class, instance))
+ return -1;
+
+ if (lrc_to_xe(lrc)->info.has_64bit_timestamp)
+ val = xe_mmio_read64_2x32(&hwe->gt->mmio,
+ RING_CTX_TIMESTAMP(hwe->mmio_base));
+ else
+ val = xe_mmio_read32(&hwe->gt->mmio,
+ RING_CTX_TIMESTAMP(hwe->mmio_base));
+
+ *reg_ctx_ts = val;
+
+ return 0;
+}
+
/**
* xe_lrc_update_timestamp() - Update ctx timestamp
* @lrc: Pointer to the lrc.
* @old_ts: Old timestamp value
*
* Populate @old_ts current saved ctx timestamp, read new ctx timestamp and
- * update saved value.
+ * update saved value. With support for active contexts, the calculation may be
+ * slightly racy, so follow a read-again logic to ensure that the context is
+ * still active before returning the right timestamp.
*
* Returns: New ctx timestamp value
*/
-u32 xe_lrc_update_timestamp(struct xe_lrc *lrc, u32 *old_ts)
+u64 xe_lrc_update_timestamp(struct xe_lrc *lrc, u64 *old_ts)
{
+ u64 lrc_ts, reg_ts;
+ u32 engine_id;
+
*old_ts = lrc->ctx_timestamp;
- lrc->ctx_timestamp = xe_lrc_ctx_timestamp(lrc);
+ lrc_ts = xe_lrc_ctx_timestamp(lrc);
+ /* CTX_TIMESTAMP mmio read is invalid on VF, so return the LRC value */
+ if (IS_SRIOV_VF(lrc_to_xe(lrc))) {
+ lrc->ctx_timestamp = lrc_ts;
+ goto done;
+ }
+
+ if (lrc_ts == CONTEXT_ACTIVE) {
+ engine_id = xe_lrc_engine_id(lrc);
+ if (!get_ctx_timestamp(lrc, engine_id, &reg_ts))
+ lrc->ctx_timestamp = reg_ts;
+
+ /* read lrc again to ensure context is still active */
+ lrc_ts = xe_lrc_ctx_timestamp(lrc);
+ }
+
+ /*
+ * If context switched out, just use the lrc_ts. Note that this needs to
+ * be a separate if condition.
+ */
+ if (lrc_ts != CONTEXT_ACTIVE)
+ lrc->ctx_timestamp = lrc_ts;
+done:
trace_xe_lrc_update_timestamp(lrc, *old_ts);
return lrc->ctx_timestamp;
diff --git a/drivers/gpu/drm/xe/xe_lrc.h b/drivers/gpu/drm/xe/xe_lrc.h
index 0b40f349ab95..eb6e8de8c939 100644
--- a/drivers/gpu/drm/xe/xe_lrc.h
+++ b/drivers/gpu/drm/xe/xe_lrc.h
@@ -120,7 +120,8 @@ void xe_lrc_snapshot_print(struct xe_lrc_snapshot *snapshot, struct drm_printer
void xe_lrc_snapshot_free(struct xe_lrc_snapshot *snapshot);
u32 xe_lrc_ctx_timestamp_ggtt_addr(struct xe_lrc *lrc);
-u32 xe_lrc_ctx_timestamp(struct xe_lrc *lrc);
+u32 xe_lrc_ctx_timestamp_udw_ggtt_addr(struct xe_lrc *lrc);
+u64 xe_lrc_ctx_timestamp(struct xe_lrc *lrc);
u32 xe_lrc_ctx_job_timestamp_ggtt_addr(struct xe_lrc *lrc);
u32 xe_lrc_ctx_job_timestamp(struct xe_lrc *lrc);
@@ -136,6 +137,6 @@ u32 xe_lrc_ctx_job_timestamp(struct xe_lrc *lrc);
*
* Returns the current LRC timestamp
*/
-u32 xe_lrc_update_timestamp(struct xe_lrc *lrc, u32 *old_ts);
+u64 xe_lrc_update_timestamp(struct xe_lrc *lrc, u64 *old_ts);
#endif
diff --git a/drivers/gpu/drm/xe/xe_lrc_types.h b/drivers/gpu/drm/xe/xe_lrc_types.h
index 71ecb453f811..ae24cf6f8dd9 100644
--- a/drivers/gpu/drm/xe/xe_lrc_types.h
+++ b/drivers/gpu/drm/xe/xe_lrc_types.h
@@ -25,8 +25,8 @@ struct xe_lrc {
/** @size: size of lrc including any indirect ring state page */
u32 size;
- /** @tile: tile which this LRC belongs to */
- struct xe_tile *tile;
+ /** @gt: gt which this LRC belongs to */
+ struct xe_gt *gt;
/** @flags: LRC flags */
#define XE_LRC_FLAG_INDIRECT_RING_STATE 0x1
@@ -52,7 +52,10 @@ struct xe_lrc {
struct xe_hw_fence_ctx fence_ctx;
/** @ctx_timestamp: readout value of CTX_TIMESTAMP on last update */
- u32 ctx_timestamp;
+ u64 ctx_timestamp;
+
+ /** @bb_per_ctx_bo: buffer object for per context batch wa buffer */
+ struct xe_bo *bb_per_ctx_bo;
};
struct xe_lrc_snapshot;
diff --git a/drivers/gpu/drm/xe/xe_mmio.c b/drivers/gpu/drm/xe/xe_mmio.c
index 70a36e777546..46301f341773 100644
--- a/drivers/gpu/drm/xe/xe_mmio.c
+++ b/drivers/gpu/drm/xe/xe_mmio.c
@@ -75,12 +75,12 @@ static void mmio_multi_tile_setup(struct xe_device *xe, size_t tile_mmio_size)
* is fine as it's going to the root tile's mmio, that's
* guaranteed to be initialized earlier in xe_mmio_probe_early()
*/
- mtcfg = xe_mmio_read64_2x32(mmio, XEHP_MTCFG_ADDR);
+ mtcfg = xe_mmio_read32(mmio, XEHP_MTCFG_ADDR);
tile_count = REG_FIELD_GET(TILE_COUNT, mtcfg) + 1;
if (tile_count < xe->info.tile_count) {
drm_info(&xe->drm, "tile_count: %d, reduced_tile_count %d\n",
- xe->info.tile_count, tile_count);
+ xe->info.tile_count, tile_count);
xe->info.tile_count = tile_count;
/*
@@ -128,7 +128,7 @@ int xe_mmio_probe_early(struct xe_device *xe)
*/
xe->mmio.size = pci_resource_len(pdev, GTTMMADR_BAR);
xe->mmio.regs = pci_iomap(pdev, GTTMMADR_BAR, 0);
- if (xe->mmio.regs == NULL) {
+ if (!xe->mmio.regs) {
drm_err(&xe->drm, "failed to map registers\n");
return -EIO;
}
@@ -309,8 +309,8 @@ u64 xe_mmio_read64_2x32(struct xe_mmio *mmio, struct xe_reg reg)
return (u64)udw << 32 | ldw;
}
-static int __xe_mmio_wait32(struct xe_mmio *mmio, struct xe_reg reg, u32 mask, u32 val, u32 timeout_us,
- u32 *out_val, bool atomic, bool expect_match)
+static int __xe_mmio_wait32(struct xe_mmio *mmio, struct xe_reg reg, u32 mask, u32 val,
+ u32 timeout_us, u32 *out_val, bool atomic, bool expect_match)
{
ktime_t cur = ktime_get_raw();
const ktime_t end = ktime_add_us(cur, timeout_us);
diff --git a/drivers/gpu/drm/xe/xe_mocs.c b/drivers/gpu/drm/xe/xe_mocs.c
index 31dade91a089..0c737413fcb6 100644
--- a/drivers/gpu/drm/xe/xe_mocs.c
+++ b/drivers/gpu/drm/xe/xe_mocs.c
@@ -775,22 +775,23 @@ void xe_mocs_init(struct xe_gt *gt)
void xe_mocs_dump(struct xe_gt *gt, struct drm_printer *p)
{
struct xe_device *xe = gt_to_xe(gt);
+ enum xe_force_wake_domains domain;
struct xe_mocs_info table;
unsigned int fw_ref, flags;
flags = get_mocs_settings(xe, &table);
+ domain = flags & HAS_LNCF_MOCS ? XE_FORCEWAKE_ALL : XE_FW_GT;
xe_pm_runtime_get_noresume(xe);
- fw_ref = xe_force_wake_get(gt_to_fw(gt),
- flags & HAS_LNCF_MOCS ?
- XE_FORCEWAKE_ALL : XE_FW_GT);
- if (!fw_ref)
+ fw_ref = xe_force_wake_get(gt_to_fw(gt), domain);
+
+ if (!xe_force_wake_ref_has_domain(fw_ref, domain))
goto err_fw;
table.ops->dump(&table, flags, gt, p);
- xe_force_wake_put(gt_to_fw(gt), fw_ref);
err_fw:
+ xe_force_wake_put(gt_to_fw(gt), fw_ref);
xe_pm_runtime_put(xe);
}
diff --git a/drivers/gpu/drm/xe/xe_module.c b/drivers/gpu/drm/xe/xe_module.c
index 9f4632e39a1a..e861c694f336 100644
--- a/drivers/gpu/drm/xe/xe_module.c
+++ b/drivers/gpu/drm/xe/xe_module.c
@@ -29,9 +29,6 @@ struct xe_modparam xe_modparam = {
module_param_named(svm_notifier_size, xe_modparam.svm_notifier_size, uint, 0600);
MODULE_PARM_DESC(svm_notifier_size, "Set the svm notifier size(in MiB), must be power of 2");
-module_param_named(always_migrate_to_vram, xe_modparam.always_migrate_to_vram, bool, 0444);
-MODULE_PARM_DESC(always_migrate_to_vram, "Always migrate to VRAM on GPU fault");
-
module_param_named_unsafe(force_execlist, xe_modparam.force_execlist, bool, 0444);
MODULE_PARM_DESC(force_execlist, "Force Execlist submission");
diff --git a/drivers/gpu/drm/xe/xe_module.h b/drivers/gpu/drm/xe/xe_module.h
index 84339e509c80..5a3bfea8b7b4 100644
--- a/drivers/gpu/drm/xe/xe_module.h
+++ b/drivers/gpu/drm/xe/xe_module.h
@@ -12,7 +12,6 @@
struct xe_modparam {
bool force_execlist;
bool probe_display;
- bool always_migrate_to_vram;
u32 force_vram_bar_size;
int guc_log_level;
char *guc_firmware_path;
diff --git a/drivers/gpu/drm/xe/xe_pci.c b/drivers/gpu/drm/xe/xe_pci.c
index 818f023166d5..f4d108dc49b1 100644
--- a/drivers/gpu/drm/xe/xe_pci.c
+++ b/drivers/gpu/drm/xe/xe_pci.c
@@ -140,6 +140,7 @@ static const struct xe_graphics_desc graphics_xelpg = {
.has_indirect_ring_state = 1, \
.has_range_tlb_invalidation = 1, \
.has_usm = 1, \
+ .has_64bit_timestamp = 1, \
.va_bits = 48, \
.vm_max_level = 4, \
.hw_engine_mask = \
@@ -668,6 +669,7 @@ static int xe_info_init(struct xe_device *xe,
xe->info.has_range_tlb_invalidation = graphics_desc->has_range_tlb_invalidation;
xe->info.has_usm = graphics_desc->has_usm;
+ xe->info.has_64bit_timestamp = graphics_desc->has_64bit_timestamp;
for_each_remote_tile(tile, xe, id) {
int err;
diff --git a/drivers/gpu/drm/xe/xe_pci_types.h b/drivers/gpu/drm/xe/xe_pci_types.h
index e9b9bbc138d3..ca6b10d35573 100644
--- a/drivers/gpu/drm/xe/xe_pci_types.h
+++ b/drivers/gpu/drm/xe/xe_pci_types.h
@@ -21,6 +21,7 @@ struct xe_graphics_desc {
u8 has_indirect_ring_state:1;
u8 has_range_tlb_invalidation:1;
u8 has_usm:1;
+ u8 has_64bit_timestamp:1;
};
struct xe_media_desc {
diff --git a/drivers/gpu/drm/xe/xe_pt.c b/drivers/gpu/drm/xe/xe_pt.c
index ffaf0d02dc7d..856038553b81 100644
--- a/drivers/gpu/drm/xe/xe_pt.c
+++ b/drivers/gpu/drm/xe/xe_pt.c
@@ -2232,11 +2232,19 @@ static void op_commit(struct xe_vm *vm,
}
case DRM_GPUVA_OP_DRIVER:
{
+ /* WRITE_ONCE pairs with READ_ONCE in xe_svm.c */
+
if (op->subop == XE_VMA_SUBOP_MAP_RANGE) {
- op->map_range.range->tile_present |= BIT(tile->id);
- op->map_range.range->tile_invalidated &= ~BIT(tile->id);
+ WRITE_ONCE(op->map_range.range->tile_present,
+ op->map_range.range->tile_present |
+ BIT(tile->id));
+ WRITE_ONCE(op->map_range.range->tile_invalidated,
+ op->map_range.range->tile_invalidated &
+ ~BIT(tile->id));
} else if (op->subop == XE_VMA_SUBOP_UNMAP_RANGE) {
- op->unmap_range.range->tile_present &= ~BIT(tile->id);
+ WRITE_ONCE(op->unmap_range.range->tile_present,
+ op->unmap_range.range->tile_present &
+ ~BIT(tile->id));
}
break;
}
diff --git a/drivers/gpu/drm/xe/xe_ring_ops.c b/drivers/gpu/drm/xe/xe_ring_ops.c
index a7582b097ae6..bc1689db4cd7 100644
--- a/drivers/gpu/drm/xe/xe_ring_ops.c
+++ b/drivers/gpu/drm/xe/xe_ring_ops.c
@@ -234,13 +234,10 @@ static u32 get_ppgtt_flag(struct xe_sched_job *job)
static int emit_copy_timestamp(struct xe_lrc *lrc, u32 *dw, int i)
{
- dw[i++] = MI_COPY_MEM_MEM | MI_COPY_MEM_MEM_SRC_GGTT |
- MI_COPY_MEM_MEM_DST_GGTT;
+ dw[i++] = MI_STORE_REGISTER_MEM | MI_SRM_USE_GGTT | MI_SRM_ADD_CS_OFFSET;
+ dw[i++] = RING_CTX_TIMESTAMP(0).addr;
dw[i++] = xe_lrc_ctx_job_timestamp_ggtt_addr(lrc);
dw[i++] = 0;
- dw[i++] = xe_lrc_ctx_timestamp_ggtt_addr(lrc);
- dw[i++] = 0;
- dw[i++] = MI_NOOP;
return i;
}
diff --git a/drivers/gpu/drm/xe/xe_shrinker.c b/drivers/gpu/drm/xe/xe_shrinker.c
index 8184390f9c7b..86d47aaf0358 100644
--- a/drivers/gpu/drm/xe/xe_shrinker.c
+++ b/drivers/gpu/drm/xe/xe_shrinker.c
@@ -227,7 +227,7 @@ struct xe_shrinker *xe_shrinker_create(struct xe_device *xe)
if (!shrinker)
return ERR_PTR(-ENOMEM);
- shrinker->shrink = shrinker_alloc(0, "xe system shrinker");
+ shrinker->shrink = shrinker_alloc(0, "drm-xe_gem:%s", xe->drm.unique);
if (!shrinker->shrink) {
kfree(shrinker);
return ERR_PTR(-ENOMEM);
diff --git a/drivers/gpu/drm/xe/xe_svm.c b/drivers/gpu/drm/xe/xe_svm.c
index 24c578e1170e..975094c1a582 100644
--- a/drivers/gpu/drm/xe/xe_svm.c
+++ b/drivers/gpu/drm/xe/xe_svm.c
@@ -15,8 +15,17 @@
static bool xe_svm_range_in_vram(struct xe_svm_range *range)
{
- /* Not reliable without notifier lock */
- return range->base.flags.has_devmem_pages;
+ /*
+ * Advisory only check whether the range is currently backed by VRAM
+ * memory.
+ */
+
+ struct drm_gpusvm_range_flags flags = {
+ /* Pairs with WRITE_ONCE in drm_gpusvm.c */
+ .__flags = READ_ONCE(range->base.flags.__flags),
+ };
+
+ return flags.has_devmem_pages;
}
static bool xe_svm_range_has_vram_binding(struct xe_svm_range *range)
@@ -645,9 +654,16 @@ void xe_svm_fini(struct xe_vm *vm)
}
static bool xe_svm_range_is_valid(struct xe_svm_range *range,
- struct xe_tile *tile)
+ struct xe_tile *tile,
+ bool devmem_only)
{
- return (range->tile_present & ~range->tile_invalidated) & BIT(tile->id);
+ /*
+ * Advisory only check whether the range currently has a valid mapping,
+ * READ_ONCE pairs with WRITE_ONCE in xe_pt.c
+ */
+ return ((READ_ONCE(range->tile_present) &
+ ~READ_ONCE(range->tile_invalidated)) & BIT(tile->id)) &&
+ (!devmem_only || xe_svm_range_in_vram(range));
}
static struct xe_vram_region *tile_to_vr(struct xe_tile *tile)
@@ -712,6 +728,36 @@ unlock:
return err;
}
+static bool supports_4K_migration(struct xe_device *xe)
+{
+ if (xe->info.vram_flags & XE_VRAM_FLAGS_NEED64K)
+ return false;
+
+ return true;
+}
+
+static bool xe_svm_range_needs_migrate_to_vram(struct xe_svm_range *range,
+ struct xe_vma *vma)
+{
+ struct xe_vm *vm = range_to_vm(&range->base);
+ u64 range_size = xe_svm_range_size(range);
+
+ if (!range->base.flags.migrate_devmem)
+ return false;
+
+ if (xe_svm_range_in_vram(range)) {
+ drm_dbg(&vm->xe->drm, "Range is already in VRAM\n");
+ return false;
+ }
+
+ if (range_size <= SZ_64K && !supports_4K_migration(vm->xe)) {
+ drm_dbg(&vm->xe->drm, "Platform doesn't support SZ_4K range migration\n");
+ return false;
+ }
+
+ return true;
+}
+
/**
* xe_svm_handle_pagefault() - SVM handle page fault
* @vm: The VM.
@@ -735,11 +781,16 @@ int xe_svm_handle_pagefault(struct xe_vm *vm, struct xe_vma *vma,
IS_ENABLED(CONFIG_DRM_XE_DEVMEM_MIRROR),
.check_pages_threshold = IS_DGFX(vm->xe) &&
IS_ENABLED(CONFIG_DRM_XE_DEVMEM_MIRROR) ? SZ_64K : 0,
+ .devmem_only = atomic && IS_DGFX(vm->xe) &&
+ IS_ENABLED(CONFIG_DRM_XE_DEVMEM_MIRROR),
+ .timeslice_ms = atomic && IS_DGFX(vm->xe) &&
+ IS_ENABLED(CONFIG_DRM_XE_DEVMEM_MIRROR) ? 5 : 0,
};
struct xe_svm_range *range;
struct drm_gpusvm_range *r;
struct drm_exec exec;
struct dma_fence *fence;
+ int migrate_try_count = ctx.devmem_only ? 3 : 1;
ktime_t end = 0;
int err;
@@ -758,24 +809,31 @@ retry:
if (IS_ERR(r))
return PTR_ERR(r);
+ if (ctx.devmem_only && !r->flags.migrate_devmem)
+ return -EACCES;
+
range = to_xe_range(r);
- if (xe_svm_range_is_valid(range, tile))
+ if (xe_svm_range_is_valid(range, tile, ctx.devmem_only))
return 0;
range_debug(range, "PAGE FAULT");
- /* XXX: Add migration policy, for now migrate range once */
- if (!range->skip_migrate && range->base.flags.migrate_devmem &&
- xe_svm_range_size(range) >= SZ_64K) {
- range->skip_migrate = true;
-
+ if (--migrate_try_count >= 0 &&
+ xe_svm_range_needs_migrate_to_vram(range, vma)) {
err = xe_svm_alloc_vram(vm, tile, range, &ctx);
+ ctx.timeslice_ms <<= 1; /* Double timeslice if we have to retry */
if (err) {
- drm_dbg(&vm->xe->drm,
- "VRAM allocation failed, falling back to "
- "retrying fault, asid=%u, errno=%pe\n",
- vm->usm.asid, ERR_PTR(err));
- goto retry;
+ if (migrate_try_count || !ctx.devmem_only) {
+ drm_dbg(&vm->xe->drm,
+ "VRAM allocation failed, falling back to retrying fault, asid=%u, errno=%pe\n",
+ vm->usm.asid, ERR_PTR(err));
+ goto retry;
+ } else {
+ drm_err(&vm->xe->drm,
+ "VRAM allocation failed, retry count exceeded, asid=%u, errno=%pe\n",
+ vm->usm.asid, ERR_PTR(err));
+ return err;
+ }
}
}
@@ -783,15 +841,23 @@ retry:
err = drm_gpusvm_range_get_pages(&vm->svm.gpusvm, r, &ctx);
/* Corner where CPU mappings have changed */
if (err == -EOPNOTSUPP || err == -EFAULT || err == -EPERM) {
- if (err == -EOPNOTSUPP) {
- range_debug(range, "PAGE FAULT - EVICT PAGES");
- drm_gpusvm_range_evict(&vm->svm.gpusvm, &range->base);
+ ctx.timeslice_ms <<= 1; /* Double timeslice if we have to retry */
+ if (migrate_try_count > 0 || !ctx.devmem_only) {
+ if (err == -EOPNOTSUPP) {
+ range_debug(range, "PAGE FAULT - EVICT PAGES");
+ drm_gpusvm_range_evict(&vm->svm.gpusvm,
+ &range->base);
+ }
+ drm_dbg(&vm->xe->drm,
+ "Get pages failed, falling back to retrying, asid=%u, gpusvm=%p, errno=%pe\n",
+ vm->usm.asid, &vm->svm.gpusvm, ERR_PTR(err));
+ range_debug(range, "PAGE FAULT - RETRY PAGES");
+ goto retry;
+ } else {
+ drm_err(&vm->xe->drm,
+ "Get pages failed, retry count exceeded, asid=%u, gpusvm=%p, errno=%pe\n",
+ vm->usm.asid, &vm->svm.gpusvm, ERR_PTR(err));
}
- drm_dbg(&vm->xe->drm,
- "Get pages failed, falling back to retrying, asid=%u, gpusvm=%p, errno=%pe\n",
- vm->usm.asid, &vm->svm.gpusvm, ERR_PTR(err));
- range_debug(range, "PAGE FAULT - RETRY PAGES");
- goto retry;
}
if (err) {
range_debug(range, "PAGE FAULT - FAIL PAGE COLLECT");
@@ -815,6 +881,7 @@ retry_bind:
drm_exec_fini(&exec);
err = PTR_ERR(fence);
if (err == -EAGAIN) {
+ ctx.timeslice_ms <<= 1; /* Double timeslice if we have to retry */
range_debug(range, "PAGE FAULT - RETRY BIND");
goto retry;
}
@@ -825,9 +892,6 @@ retry_bind:
}
drm_exec_fini(&exec);
- if (xe_modparam.always_migrate_to_vram)
- range->skip_migrate = false;
-
dma_fence_wait(fence, false);
dma_fence_put(fence);
diff --git a/drivers/gpu/drm/xe/xe_svm.h b/drivers/gpu/drm/xe/xe_svm.h
index be306fe7aaa4..fe58ac2f4baa 100644
--- a/drivers/gpu/drm/xe/xe_svm.h
+++ b/drivers/gpu/drm/xe/xe_svm.h
@@ -36,11 +36,6 @@ struct xe_svm_range {
* range. Protected by GPU SVM notifier lock.
*/
u8 tile_invalidated;
- /**
- * @skip_migrate: Skip migration to VRAM, protected by GPU fault handler
- * locking.
- */
- u8 skip_migrate :1;
};
#if IS_ENABLED(CONFIG_DRM_GPUSVM)
diff --git a/drivers/gpu/drm/xe/xe_trace_lrc.h b/drivers/gpu/drm/xe/xe_trace_lrc.h
index 5c669a0b2180..d525cbee1e34 100644
--- a/drivers/gpu/drm/xe/xe_trace_lrc.h
+++ b/drivers/gpu/drm/xe/xe_trace_lrc.h
@@ -19,12 +19,12 @@
#define __dev_name_lrc(lrc) dev_name(gt_to_xe((lrc)->fence_ctx.gt)->drm.dev)
TRACE_EVENT(xe_lrc_update_timestamp,
- TP_PROTO(struct xe_lrc *lrc, uint32_t old),
+ TP_PROTO(struct xe_lrc *lrc, uint64_t old),
TP_ARGS(lrc, old),
TP_STRUCT__entry(
__field(struct xe_lrc *, lrc)
- __field(u32, old)
- __field(u32, new)
+ __field(u64, old)
+ __field(u64, new)
__string(name, lrc->fence_ctx.name)
__string(device_id, __dev_name_lrc(lrc))
),
@@ -36,7 +36,7 @@ TRACE_EVENT(xe_lrc_update_timestamp,
__assign_str(name);
__assign_str(device_id);
),
- TP_printk("lrc=:%p lrc->name=%s old=%u new=%u device_id:%s",
+ TP_printk("lrc=:%p lrc->name=%s old=%llu new=%llu device_id:%s",
__entry->lrc, __get_str(name),
__entry->old, __entry->new,
__get_str(device_id))
diff --git a/drivers/gpu/drm/xe/xe_wa.c b/drivers/gpu/drm/xe/xe_wa.c
index 24f644c0a673..2f833f0d575f 100644
--- a/drivers/gpu/drm/xe/xe_wa.c
+++ b/drivers/gpu/drm/xe/xe_wa.c
@@ -815,6 +815,10 @@ static const struct xe_rtp_entry_sr lrc_was[] = {
XE_RTP_RULES(GRAPHICS_VERSION(2001), ENGINE_CLASS(RENDER)),
XE_RTP_ACTIONS(SET(CHICKEN_RASTER_1, DIS_CLIP_NEGATIVE_BOUNDING_BOX))
},
+ { XE_RTP_NAME("22021007897"),
+ XE_RTP_RULES(GRAPHICS_VERSION(2001), ENGINE_CLASS(RENDER)),
+ XE_RTP_ACTIONS(SET(COMMON_SLICE_CHICKEN4, SBE_PUSH_CONSTANT_BEHIND_FIX_ENABLE))
+ },
/* Xe3_LPG */
{ XE_RTP_NAME("14021490052"),
diff --git a/drivers/hid/amd-sfh-hid/sfh1_1/amd_sfh_init.c b/drivers/hid/amd-sfh-hid/sfh1_1/amd_sfh_init.c
index 25f0ebfcbd5f..0a9b44ce4904 100644
--- a/drivers/hid/amd-sfh-hid/sfh1_1/amd_sfh_init.c
+++ b/drivers/hid/amd-sfh-hid/sfh1_1/amd_sfh_init.c
@@ -83,6 +83,9 @@ static int amd_sfh_hid_client_deinit(struct amd_mp2_dev *privdata)
case ALS_IDX:
privdata->dev_en.is_als_present = false;
break;
+ case SRA_IDX:
+ privdata->dev_en.is_sra_present = false;
+ break;
}
if (cl_data->sensor_sts[i] == SENSOR_ENABLED) {
@@ -134,9 +137,6 @@ static int amd_sfh1_1_hid_client_init(struct amd_mp2_dev *privdata)
for (i = 0; i < cl_data->num_hid_devices; i++) {
cl_data->sensor_sts[i] = SENSOR_DISABLED;
- if (cl_data->num_hid_devices == 1 && cl_data->sensor_idx[0] == SRA_IDX)
- break;
-
if (cl_data->sensor_idx[i] == SRA_IDX) {
info.sensor_idx = cl_data->sensor_idx[i];
writel(0, privdata->mmio + amd_get_p2c_val(privdata, 0));
@@ -145,8 +145,10 @@ static int amd_sfh1_1_hid_client_init(struct amd_mp2_dev *privdata)
(privdata, cl_data->sensor_idx[i], ENABLE_SENSOR);
cl_data->sensor_sts[i] = (status == 0) ? SENSOR_ENABLED : SENSOR_DISABLED;
- if (cl_data->sensor_sts[i] == SENSOR_ENABLED)
+ if (cl_data->sensor_sts[i] == SENSOR_ENABLED) {
+ cl_data->is_any_sensor_enabled = true;
privdata->dev_en.is_sra_present = true;
+ }
continue;
}
@@ -238,6 +240,8 @@ static int amd_sfh1_1_hid_client_init(struct amd_mp2_dev *privdata)
cleanup:
amd_sfh_hid_client_deinit(privdata);
for (i = 0; i < cl_data->num_hid_devices; i++) {
+ if (cl_data->sensor_idx[i] == SRA_IDX)
+ continue;
devm_kfree(dev, cl_data->feature_report[i]);
devm_kfree(dev, in_data->input_report[i]);
devm_kfree(dev, cl_data->report_descr[i]);
diff --git a/drivers/hid/bpf/hid_bpf_dispatch.c b/drivers/hid/bpf/hid_bpf_dispatch.c
index 2e96ec6a3073..9a06f9b0e4ef 100644
--- a/drivers/hid/bpf/hid_bpf_dispatch.c
+++ b/drivers/hid/bpf/hid_bpf_dispatch.c
@@ -38,6 +38,9 @@ dispatch_hid_bpf_device_event(struct hid_device *hdev, enum hid_report_type type
struct hid_bpf_ops *e;
int ret;
+ if (unlikely(hdev->bpf.destroyed))
+ return ERR_PTR(-ENODEV);
+
if (type >= HID_REPORT_TYPES)
return ERR_PTR(-EINVAL);
@@ -93,6 +96,9 @@ int dispatch_hid_bpf_raw_requests(struct hid_device *hdev,
struct hid_bpf_ops *e;
int ret, idx;
+ if (unlikely(hdev->bpf.destroyed))
+ return -ENODEV;
+
if (rtype >= HID_REPORT_TYPES)
return -EINVAL;
@@ -130,6 +136,9 @@ int dispatch_hid_bpf_output_report(struct hid_device *hdev,
struct hid_bpf_ops *e;
int ret, idx;
+ if (unlikely(hdev->bpf.destroyed))
+ return -ENODEV;
+
idx = srcu_read_lock(&hdev->bpf.srcu);
list_for_each_entry_srcu(e, &hdev->bpf.prog_list, list,
srcu_read_lock_held(&hdev->bpf.srcu)) {
diff --git a/drivers/hid/bpf/progs/XPPen__ACK05.bpf.c b/drivers/hid/bpf/progs/XPPen__ACK05.bpf.c
index 1a0aeea6a081..a754710fc90b 100644
--- a/drivers/hid/bpf/progs/XPPen__ACK05.bpf.c
+++ b/drivers/hid/bpf/progs/XPPen__ACK05.bpf.c
@@ -157,6 +157,7 @@ static const __u8 fixed_rdesc_vendor[] = {
ReportCount(5) // padding
Input(Const)
// Byte 4 in report - just exists so we get to be a tablet pad
+ UsagePage_Digitizers
Usage_Dig_BarrelSwitch // BTN_STYLUS
ReportCount(1)
ReportSize(1)
diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h
index 288a2b864cc4..1062731315a2 100644
--- a/drivers/hid/hid-ids.h
+++ b/drivers/hid/hid-ids.h
@@ -41,6 +41,10 @@
#define USB_VENDOR_ID_ACTIONSTAR 0x2101
#define USB_DEVICE_ID_ACTIONSTAR_1011 0x1011
+#define USB_VENDOR_ID_ADATA_XPG 0x125f
+#define USB_VENDOR_ID_ADATA_XPG_WL_GAMING_MOUSE 0x7505
+#define USB_VENDOR_ID_ADATA_XPG_WL_GAMING_MOUSE_DONGLE 0x7506
+
#define USB_VENDOR_ID_ADS_TECH 0x06e1
#define USB_DEVICE_ID_ADS_TECH_RADIO_SI470X 0xa155
diff --git a/drivers/hid/hid-quirks.c b/drivers/hid/hid-quirks.c
index 646171598e41..0731473cc9b1 100644
--- a/drivers/hid/hid-quirks.c
+++ b/drivers/hid/hid-quirks.c
@@ -27,6 +27,8 @@
static const struct hid_device_id hid_quirks[] = {
{ HID_USB_DEVICE(USB_VENDOR_ID_AASHIMA, USB_DEVICE_ID_AASHIMA_GAMEPAD), HID_QUIRK_BADPAD },
{ HID_USB_DEVICE(USB_VENDOR_ID_AASHIMA, USB_DEVICE_ID_AASHIMA_PREDATOR), HID_QUIRK_BADPAD },
+ { HID_USB_DEVICE(USB_VENDOR_ID_ADATA_XPG, USB_VENDOR_ID_ADATA_XPG_WL_GAMING_MOUSE), HID_QUIRK_ALWAYS_POLL },
+ { HID_USB_DEVICE(USB_VENDOR_ID_ADATA_XPG, USB_VENDOR_ID_ADATA_XPG_WL_GAMING_MOUSE_DONGLE), HID_QUIRK_ALWAYS_POLL },
{ HID_USB_DEVICE(USB_VENDOR_ID_AFATECH, USB_DEVICE_ID_AFATECH_AF9016), HID_QUIRK_FULLSPEED_INTERVAL },
{ HID_USB_DEVICE(USB_VENDOR_ID_AIREN, USB_DEVICE_ID_AIREN_SLIMPLUS), HID_QUIRK_NOGET },
{ HID_USB_DEVICE(USB_VENDOR_ID_AKAI_09E8, USB_DEVICE_ID_AKAI_09E8_MIDIMIX), HID_QUIRK_NO_INIT_REPORTS },
diff --git a/drivers/hid/hid-steam.c b/drivers/hid/hid-steam.c
index dfd9d22ed559..949d307c66a8 100644
--- a/drivers/hid/hid-steam.c
+++ b/drivers/hid/hid-steam.c
@@ -1150,11 +1150,9 @@ static void steam_client_ll_close(struct hid_device *hdev)
struct steam_device *steam = hdev->driver_data;
unsigned long flags;
- bool connected;
spin_lock_irqsave(&steam->lock, flags);
steam->client_opened--;
- connected = steam->connected && !steam->client_opened;
spin_unlock_irqrestore(&steam->lock, flags);
schedule_work(&steam->unregister_work);
diff --git a/drivers/hid/hid-thrustmaster.c b/drivers/hid/hid-thrustmaster.c
index 3b81468a1df2..0bf70664c35e 100644
--- a/drivers/hid/hid-thrustmaster.c
+++ b/drivers/hid/hid-thrustmaster.c
@@ -174,6 +174,7 @@ static void thrustmaster_interrupts(struct hid_device *hdev)
u8 ep_addr[2] = {b_ep, 0};
if (!usb_check_int_endpoints(usbif, ep_addr)) {
+ kfree(send_buf);
hid_err(hdev, "Unexpected non-int endpoint\n");
return;
}
diff --git a/drivers/hid/hid-uclogic-core.c b/drivers/hid/hid-uclogic-core.c
index a367df6ea01f..61a4019ddc74 100644
--- a/drivers/hid/hid-uclogic-core.c
+++ b/drivers/hid/hid-uclogic-core.c
@@ -142,11 +142,12 @@ static int uclogic_input_configured(struct hid_device *hdev,
suffix = "System Control";
break;
}
- }
-
- if (suffix)
+ } else {
hi->input->name = devm_kasprintf(&hdev->dev, GFP_KERNEL,
"%s %s", hdev->name, suffix);
+ if (!hi->input->name)
+ return -ENOMEM;
+ }
return 0;
}
diff --git a/drivers/hid/wacom_sys.c b/drivers/hid/wacom_sys.c
index 1556d4287fa5..eaf099b2efdb 100644
--- a/drivers/hid/wacom_sys.c
+++ b/drivers/hid/wacom_sys.c
@@ -70,10 +70,16 @@ static void wacom_wac_queue_flush(struct hid_device *hdev,
{
while (!kfifo_is_empty(fifo)) {
int size = kfifo_peek_len(fifo);
- u8 *buf = kzalloc(size, GFP_KERNEL);
+ u8 *buf;
unsigned int count;
int err;
+ buf = kzalloc(size, GFP_KERNEL);
+ if (!buf) {
+ kfifo_skip(fifo);
+ continue;
+ }
+
count = kfifo_out(fifo, buf, size);
if (count != size) {
// Hard to say what is the "right" action in this
@@ -81,6 +87,7 @@ static void wacom_wac_queue_flush(struct hid_device *hdev,
// to flush seems reasonable enough, however.
hid_warn(hdev, "%s: removed fifo entry with unexpected size\n",
__func__);
+ kfree(buf);
continue;
}
err = hid_report_raw_event(hdev, HID_INPUT_REPORT, buf, size, false);
@@ -2361,6 +2368,8 @@ static int wacom_parse_and_register(struct wacom *wacom, bool wireless)
unsigned int connect_mask = HID_CONNECT_HIDRAW;
features->pktlen = wacom_compute_pktlen(hdev);
+ if (!features->pktlen)
+ return -ENODEV;
if (!devres_open_group(&hdev->dev, wacom, GFP_KERNEL))
return -ENOMEM;
diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c
index fb8cd8469328..35f26fa1ffe7 100644
--- a/drivers/hv/channel.c
+++ b/drivers/hv/channel.c
@@ -1077,68 +1077,10 @@ int vmbus_sendpacket(struct vmbus_channel *channel, void *buffer,
EXPORT_SYMBOL(vmbus_sendpacket);
/*
- * vmbus_sendpacket_pagebuffer - Send a range of single-page buffer
- * packets using a GPADL Direct packet type. This interface allows you
- * to control notifying the host. This will be useful for sending
- * batched data. Also the sender can control the send flags
- * explicitly.
- */
-int vmbus_sendpacket_pagebuffer(struct vmbus_channel *channel,
- struct hv_page_buffer pagebuffers[],
- u32 pagecount, void *buffer, u32 bufferlen,
- u64 requestid)
-{
- int i;
- struct vmbus_channel_packet_page_buffer desc;
- u32 descsize;
- u32 packetlen;
- u32 packetlen_aligned;
- struct kvec bufferlist[3];
- u64 aligned_data = 0;
-
- if (pagecount > MAX_PAGE_BUFFER_COUNT)
- return -EINVAL;
-
- /*
- * Adjust the size down since vmbus_channel_packet_page_buffer is the
- * largest size we support
- */
- descsize = sizeof(struct vmbus_channel_packet_page_buffer) -
- ((MAX_PAGE_BUFFER_COUNT - pagecount) *
- sizeof(struct hv_page_buffer));
- packetlen = descsize + bufferlen;
- packetlen_aligned = ALIGN(packetlen, sizeof(u64));
-
- /* Setup the descriptor */
- desc.type = VM_PKT_DATA_USING_GPA_DIRECT;
- desc.flags = VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED;
- desc.dataoffset8 = descsize >> 3; /* in 8-bytes granularity */
- desc.length8 = (u16)(packetlen_aligned >> 3);
- desc.transactionid = VMBUS_RQST_ERROR; /* will be updated in hv_ringbuffer_write() */
- desc.reserved = 0;
- desc.rangecount = pagecount;
-
- for (i = 0; i < pagecount; i++) {
- desc.range[i].len = pagebuffers[i].len;
- desc.range[i].offset = pagebuffers[i].offset;
- desc.range[i].pfn = pagebuffers[i].pfn;
- }
-
- bufferlist[0].iov_base = &desc;
- bufferlist[0].iov_len = descsize;
- bufferlist[1].iov_base = buffer;
- bufferlist[1].iov_len = bufferlen;
- bufferlist[2].iov_base = &aligned_data;
- bufferlist[2].iov_len = (packetlen_aligned - packetlen);
-
- return hv_ringbuffer_write(channel, bufferlist, 3, requestid, NULL);
-}
-EXPORT_SYMBOL_GPL(vmbus_sendpacket_pagebuffer);
-
-/*
- * vmbus_sendpacket_multipagebuffer - Send a multi-page buffer packet
+ * vmbus_sendpacket_mpb_desc - Send one or more multi-page buffer packets
* using a GPADL Direct packet type.
- * The buffer includes the vmbus descriptor.
+ * The desc argument must include space for the VMBus descriptor. The
+ * rangecount field must already be set.
*/
int vmbus_sendpacket_mpb_desc(struct vmbus_channel *channel,
struct vmbus_packet_mpb_array *desc,
@@ -1160,7 +1102,6 @@ int vmbus_sendpacket_mpb_desc(struct vmbus_channel *channel,
desc->length8 = (u16)(packetlen_aligned >> 3);
desc->transactionid = VMBUS_RQST_ERROR; /* will be updated in hv_ringbuffer_write() */
desc->reserved = 0;
- desc->rangecount = 1;
bufferlist[0].iov_base = desc;
bufferlist[0].iov_len = desc_size;
diff --git a/drivers/i2c/busses/i2c-designware-pcidrv.c b/drivers/i2c/busses/i2c-designware-pcidrv.c
index 8e0267c7cc29..f21f9877c040 100644
--- a/drivers/i2c/busses/i2c-designware-pcidrv.c
+++ b/drivers/i2c/busses/i2c-designware-pcidrv.c
@@ -278,9 +278,11 @@ static int i2c_dw_pci_probe(struct pci_dev *pdev,
if ((dev->flags & MODEL_MASK) == MODEL_AMD_NAVI_GPU) {
dev->slave = i2c_new_ccgx_ucsi(&dev->adapter, dev->irq, &dgpu_node);
- if (IS_ERR(dev->slave))
+ if (IS_ERR(dev->slave)) {
+ i2c_del_adapter(&dev->adapter);
return dev_err_probe(device, PTR_ERR(dev->slave),
"register UCSI failed\n");
+ }
}
pm_runtime_set_autosuspend_delay(device, 1000);
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index b4e3e4beb7f4..d4263385850a 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -1352,6 +1352,9 @@ static void ib_device_notify_register(struct ib_device *device)
down_read(&devices_rwsem);
+ /* Mark for userspace that device is ready */
+ kobject_uevent(&device->dev.kobj, KOBJ_ADD);
+
ret = rdma_nl_notify_event(device, 0, RDMA_REGISTER_EVENT);
if (ret)
goto out;
@@ -1468,10 +1471,9 @@ int ib_register_device(struct ib_device *device, const char *name,
return ret;
}
dev_set_uevent_suppress(&device->dev, false);
- /* Mark for userspace that device is ready */
- kobject_uevent(&device->dev.kobj, KOBJ_ADD);
ib_device_notify_register(device);
+
ib_device_put(device);
return 0;
diff --git a/drivers/infiniband/hw/irdma/main.c b/drivers/infiniband/hw/irdma/main.c
index 1ee8969595d3..7599e31b5743 100644
--- a/drivers/infiniband/hw/irdma/main.c
+++ b/drivers/infiniband/hw/irdma/main.c
@@ -221,7 +221,7 @@ static int irdma_init_interrupts(struct irdma_pci_f *rf, struct ice_pf *pf)
break;
if (i < IRDMA_MIN_MSIX) {
- for (; i > 0; i--)
+ while (--i >= 0)
ice_free_rdma_qvector(pf, &rf->msix_entries[i]);
kfree(rf->msix_entries);
@@ -255,6 +255,8 @@ static void irdma_remove(struct auxiliary_device *aux_dev)
ice_rdma_update_vsi_filter(pf, iwdev->vsi_num, false);
irdma_deinit_interrupts(iwdev->rf, pf);
+ kfree(iwdev->rf);
+
pr_debug("INIT: Gen2 PF[%d] device remove success\n", PCI_FUNC(pf->pdev->devfn));
}
diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c
index eeb932e58730..1e8c92826de2 100644
--- a/drivers/infiniband/hw/irdma/verbs.c
+++ b/drivers/infiniband/hw/irdma/verbs.c
@@ -4871,5 +4871,4 @@ void irdma_ib_dealloc_device(struct ib_device *ibdev)
irdma_rt_deinit_hw(iwdev);
irdma_ctrl_deinit_hw(iwdev->rf);
- kfree(iwdev->rf);
}
diff --git a/drivers/infiniband/hw/qib/qib_fs.c b/drivers/infiniband/hw/qib/qib_fs.c
index b9f4a2937c3a..2098de762bf5 100644
--- a/drivers/infiniband/hw/qib/qib_fs.c
+++ b/drivers/infiniband/hw/qib/qib_fs.c
@@ -90,7 +90,7 @@ static int create_file(const char *name, umode_t mode,
int error;
inode_lock(d_inode(parent));
- *dentry = lookup_one_len(name, parent, strlen(name));
+ *dentry = lookup_noperm(&QSTR(name), parent);
if (!IS_ERR(*dentry))
error = qibfs_mknod(d_inode(parent), *dentry,
mode, fops, data);
@@ -433,7 +433,7 @@ static int remove_device_files(struct super_block *sb,
char unit[10];
snprintf(unit, sizeof(unit), "%u", dd->unit);
- dir = lookup_one_len_unlocked(unit, sb->s_root, strlen(unit));
+ dir = lookup_noperm_unlocked(&QSTR(unit), sb->s_root);
if (IS_ERR(dir)) {
pr_err("Lookup of %s failed\n", unit);
diff --git a/drivers/infiniband/sw/rxe/rxe_cq.c b/drivers/infiniband/sw/rxe/rxe_cq.c
index fec87c9030ab..fffd144d509e 100644
--- a/drivers/infiniband/sw/rxe/rxe_cq.c
+++ b/drivers/infiniband/sw/rxe/rxe_cq.c
@@ -56,11 +56,8 @@ int rxe_cq_from_init(struct rxe_dev *rxe, struct rxe_cq *cq, int cqe,
err = do_mmap_info(rxe, uresp ? &uresp->mi : NULL, udata,
cq->queue->buf, cq->queue->buf_size, &cq->queue->ip);
- if (err) {
- vfree(cq->queue->buf);
- kfree(cq->queue);
+ if (err)
return err;
- }
cq->is_user = uresp;
diff --git a/drivers/input/joystick/xpad.c b/drivers/input/joystick/xpad.c
index 57a5ff3d1992..1008858f78e2 100644
--- a/drivers/input/joystick/xpad.c
+++ b/drivers/input/joystick/xpad.c
@@ -290,6 +290,8 @@ static const struct xpad_device {
{ 0x1038, 0x1430, "SteelSeries Stratus Duo", 0, XTYPE_XBOX360 },
{ 0x1038, 0x1431, "SteelSeries Stratus Duo", 0, XTYPE_XBOX360 },
{ 0x10f5, 0x7005, "Turtle Beach Recon Controller", 0, XTYPE_XBOXONE },
+ { 0x10f5, 0x7008, "Turtle Beach Recon Controller", MAP_SHARE_BUTTON, XTYPE_XBOXONE },
+ { 0x10f5, 0x7073, "Turtle Beach Stealth Ultra Controller", MAP_SHARE_BUTTON, XTYPE_XBOXONE },
{ 0x11c9, 0x55f0, "Nacon GC-100XF", 0, XTYPE_XBOX360 },
{ 0x11ff, 0x0511, "PXN V900", 0, XTYPE_XBOX360 },
{ 0x1209, 0x2882, "Ardwiino Controller", 0, XTYPE_XBOX360 },
@@ -354,6 +356,7 @@ static const struct xpad_device {
{ 0x1ee9, 0x1590, "ZOTAC Gaming Zone", 0, XTYPE_XBOX360 },
{ 0x20d6, 0x2001, "BDA Xbox Series X Wired Controller", 0, XTYPE_XBOXONE },
{ 0x20d6, 0x2009, "PowerA Enhanced Wired Controller for Xbox Series X|S", 0, XTYPE_XBOXONE },
+ { 0x20d6, 0x2064, "PowerA Wired Controller for Xbox", MAP_SHARE_BUTTON, XTYPE_XBOXONE },
{ 0x20d6, 0x281f, "PowerA Wired Controller For Xbox 360", 0, XTYPE_XBOX360 },
{ 0x20d6, 0x400b, "PowerA FUSION Pro 4 Wired Controller", MAP_SHARE_BUTTON, XTYPE_XBOXONE },
{ 0x20d6, 0x890b, "PowerA MOGA XP-Ultra Controller", MAP_SHARE_BUTTON, XTYPE_XBOXONE },
diff --git a/drivers/input/rmi4/rmi_f34.c b/drivers/input/rmi4/rmi_f34.c
index d760af4cc12e..f1947f03b06a 100644
--- a/drivers/input/rmi4/rmi_f34.c
+++ b/drivers/input/rmi4/rmi_f34.c
@@ -4,6 +4,7 @@
* Copyright (C) 2016 Zodiac Inflight Innovations
*/
+#include "linux/device.h"
#include <linux/kernel.h>
#include <linux/rmi.h>
#include <linux/firmware.h>
@@ -289,39 +290,30 @@ static int rmi_f34_update_firmware(struct f34_data *f34,
return rmi_f34_flash_firmware(f34, syn_fw);
}
-static int rmi_f34_status(struct rmi_function *fn)
-{
- struct f34_data *f34 = dev_get_drvdata(&fn->dev);
-
- /*
- * The status is the percentage complete, or once complete,
- * zero for success or a negative return code.
- */
- return f34->update_status;
-}
-
static ssize_t rmi_driver_bootloader_id_show(struct device *dev,
struct device_attribute *dattr,
char *buf)
{
struct rmi_driver_data *data = dev_get_drvdata(dev);
- struct rmi_function *fn = data->f34_container;
+ struct rmi_function *fn;
struct f34_data *f34;
- if (fn) {
- f34 = dev_get_drvdata(&fn->dev);
-
- if (f34->bl_version == 5)
- return sysfs_emit(buf, "%c%c\n",
- f34->bootloader_id[0],
- f34->bootloader_id[1]);
- else
- return sysfs_emit(buf, "V%d.%d\n",
- f34->bootloader_id[1],
- f34->bootloader_id[0]);
- }
+ fn = data->f34_container;
+ if (!fn)
+ return -ENODEV;
- return 0;
+ f34 = dev_get_drvdata(&fn->dev);
+ if (!f34)
+ return -ENODEV;
+
+ if (f34->bl_version == 5)
+ return sysfs_emit(buf, "%c%c\n",
+ f34->bootloader_id[0],
+ f34->bootloader_id[1]);
+ else
+ return sysfs_emit(buf, "V%d.%d\n",
+ f34->bootloader_id[1],
+ f34->bootloader_id[0]);
}
static DEVICE_ATTR(bootloader_id, 0444, rmi_driver_bootloader_id_show, NULL);
@@ -334,13 +326,16 @@ static ssize_t rmi_driver_configuration_id_show(struct device *dev,
struct rmi_function *fn = data->f34_container;
struct f34_data *f34;
- if (fn) {
- f34 = dev_get_drvdata(&fn->dev);
+ fn = data->f34_container;
+ if (!fn)
+ return -ENODEV;
+
+ f34 = dev_get_drvdata(&fn->dev);
+ if (!f34)
+ return -ENODEV;
- return sysfs_emit(buf, "%s\n", f34->configuration_id);
- }
- return 0;
+ return sysfs_emit(buf, "%s\n", f34->configuration_id);
}
static DEVICE_ATTR(configuration_id, 0444,
@@ -356,10 +351,14 @@ static int rmi_firmware_update(struct rmi_driver_data *data,
if (!data->f34_container) {
dev_warn(dev, "%s: No F34 present!\n", __func__);
- return -EINVAL;
+ return -ENODEV;
}
f34 = dev_get_drvdata(&data->f34_container->dev);
+ if (!f34) {
+ dev_warn(dev, "%s: No valid F34 present!\n", __func__);
+ return -ENODEV;
+ }
if (f34->bl_version >= 7) {
if (data->pdt_props & HAS_BSR) {
@@ -485,10 +484,18 @@ static ssize_t rmi_driver_update_fw_status_show(struct device *dev,
char *buf)
{
struct rmi_driver_data *data = dev_get_drvdata(dev);
- int update_status = 0;
+ struct f34_data *f34;
+ int update_status = -ENODEV;
- if (data->f34_container)
- update_status = rmi_f34_status(data->f34_container);
+ /*
+ * The status is the percentage complete, or once complete,
+ * zero for success or a negative return code.
+ */
+ if (data->f34_container) {
+ f34 = dev_get_drvdata(&data->f34_container->dev);
+ if (f34)
+ update_status = f34->update_status;
+ }
return sysfs_emit(buf, "%d\n", update_status);
}
@@ -508,33 +515,21 @@ static const struct attribute_group rmi_firmware_attr_group = {
.attrs = rmi_firmware_attrs,
};
-static int rmi_f34_probe(struct rmi_function *fn)
+static int rmi_f34v5_probe(struct f34_data *f34)
{
- struct f34_data *f34;
- unsigned char f34_queries[9];
+ struct rmi_function *fn = f34->fn;
+ u8 f34_queries[9];
bool has_config_id;
- u8 version = fn->fd.function_version;
- int ret;
-
- f34 = devm_kzalloc(&fn->dev, sizeof(struct f34_data), GFP_KERNEL);
- if (!f34)
- return -ENOMEM;
-
- f34->fn = fn;
- dev_set_drvdata(&fn->dev, f34);
-
- /* v5 code only supported version 0, try V7 probe */
- if (version > 0)
- return rmi_f34v7_probe(f34);
+ int error;
f34->bl_version = 5;
- ret = rmi_read_block(fn->rmi_dev, fn->fd.query_base_addr,
- f34_queries, sizeof(f34_queries));
- if (ret) {
+ error = rmi_read_block(fn->rmi_dev, fn->fd.query_base_addr,
+ f34_queries, sizeof(f34_queries));
+ if (error) {
dev_err(&fn->dev, "%s: Failed to query properties\n",
__func__);
- return ret;
+ return error;
}
snprintf(f34->bootloader_id, sizeof(f34->bootloader_id),
@@ -560,11 +555,11 @@ static int rmi_f34_probe(struct rmi_function *fn)
f34->v5.config_blocks);
if (has_config_id) {
- ret = rmi_read_block(fn->rmi_dev, fn->fd.control_base_addr,
- f34_queries, sizeof(f34_queries));
- if (ret) {
+ error = rmi_read_block(fn->rmi_dev, fn->fd.control_base_addr,
+ f34_queries, sizeof(f34_queries));
+ if (error) {
dev_err(&fn->dev, "Failed to read F34 config ID\n");
- return ret;
+ return error;
}
snprintf(f34->configuration_id, sizeof(f34->configuration_id),
@@ -573,12 +568,34 @@ static int rmi_f34_probe(struct rmi_function *fn)
f34_queries[2], f34_queries[3]);
rmi_dbg(RMI_DEBUG_FN, &fn->dev, "Configuration ID: %s\n",
- f34->configuration_id);
+ f34->configuration_id);
}
return 0;
}
+static int rmi_f34_probe(struct rmi_function *fn)
+{
+ struct f34_data *f34;
+ u8 version = fn->fd.function_version;
+ int error;
+
+ f34 = devm_kzalloc(&fn->dev, sizeof(struct f34_data), GFP_KERNEL);
+ if (!f34)
+ return -ENOMEM;
+
+ f34->fn = fn;
+
+ /* v5 code only supported version 0 */
+ error = version == 0 ? rmi_f34v5_probe(f34) : rmi_f34v7_probe(f34);
+ if (error)
+ return error;
+
+ dev_set_drvdata(&fn->dev, f34);
+
+ return 0;
+}
+
int rmi_f34_create_sysfs(struct rmi_device *rmi_dev)
{
return sysfs_create_group(&rmi_dev->dev.kobj, &rmi_firmware_attr_group);
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 4f91a740c15f..9d728800a862 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -3366,10 +3366,12 @@ static int __iommu_set_group_pasid(struct iommu_domain *domain,
int ret;
for_each_group_device(group, device) {
- ret = domain->ops->set_dev_pasid(domain, device->dev,
- pasid, old);
- if (ret)
- goto err_revert;
+ if (device->dev->iommu->max_pasids > 0) {
+ ret = domain->ops->set_dev_pasid(domain, device->dev,
+ pasid, old);
+ if (ret)
+ goto err_revert;
+ }
}
return 0;
@@ -3379,15 +3381,18 @@ err_revert:
for_each_group_device(group, device) {
if (device == last_gdev)
break;
- /*
- * If no old domain, undo the succeeded devices/pasid.
- * Otherwise, rollback the succeeded devices/pasid to the old
- * domain. And it is a driver bug to fail attaching with a
- * previously good domain.
- */
- if (!old || WARN_ON(old->ops->set_dev_pasid(old, device->dev,
+ if (device->dev->iommu->max_pasids > 0) {
+ /*
+ * If no old domain, undo the succeeded devices/pasid.
+ * Otherwise, rollback the succeeded devices/pasid to
+ * the old domain. And it is a driver bug to fail
+ * attaching with a previously good domain.
+ */
+ if (!old ||
+ WARN_ON(old->ops->set_dev_pasid(old, device->dev,
pasid, domain)))
- iommu_remove_dev_pasid(device->dev, pasid, domain);
+ iommu_remove_dev_pasid(device->dev, pasid, domain);
+ }
}
return ret;
}
@@ -3398,8 +3403,10 @@ static void __iommu_remove_group_pasid(struct iommu_group *group,
{
struct group_device *device;
- for_each_group_device(group, device)
- iommu_remove_dev_pasid(device->dev, pasid, domain);
+ for_each_group_device(group, device) {
+ if (device->dev->iommu->max_pasids > 0)
+ iommu_remove_dev_pasid(device->dev, pasid, domain);
+ }
}
/*
@@ -3440,7 +3447,13 @@ int iommu_attach_device_pasid(struct iommu_domain *domain,
mutex_lock(&group->mutex);
for_each_group_device(group, device) {
- if (pasid >= device->dev->iommu->max_pasids) {
+ /*
+ * Skip PASID validation for devices without PASID support
+ * (max_pasids = 0). These devices cannot issue transactions
+ * with PASID, so they don't affect group's PASID usage.
+ */
+ if ((device->dev->iommu->max_pasids > 0) &&
+ (pasid >= device->dev->iommu->max_pasids)) {
ret = -EINVAL;
goto out_unlock;
}
diff --git a/drivers/irqchip/irq-gic-v2m.c b/drivers/irqchip/irq-gic-v2m.c
index dc98c39d2b20..cc6a6c1585d2 100644
--- a/drivers/irqchip/irq-gic-v2m.c
+++ b/drivers/irqchip/irq-gic-v2m.c
@@ -252,7 +252,7 @@ static void __init gicv2m_teardown(void)
static struct msi_parent_ops gicv2m_msi_parent_ops = {
.supported_flags = GICV2M_MSI_FLAGS_SUPPORTED,
.required_flags = GICV2M_MSI_FLAGS_REQUIRED,
- .chip_flags = MSI_CHIP_FLAG_SET_EOI | MSI_CHIP_FLAG_SET_ACK,
+ .chip_flags = MSI_CHIP_FLAG_SET_EOI,
.bus_select_token = DOMAIN_BUS_NEXUS,
.bus_select_mask = MATCH_PCI_MSI | MATCH_PLATFORM_MSI,
.prefix = "GICv2m-",
diff --git a/drivers/irqchip/irq-gic-v3-its-msi-parent.c b/drivers/irqchip/irq-gic-v3-its-msi-parent.c
index bdb04c808148..c5a7eb1c0419 100644
--- a/drivers/irqchip/irq-gic-v3-its-msi-parent.c
+++ b/drivers/irqchip/irq-gic-v3-its-msi-parent.c
@@ -203,7 +203,7 @@ static bool its_init_dev_msi_info(struct device *dev, struct irq_domain *domain,
const struct msi_parent_ops gic_v3_its_msi_parent_ops = {
.supported_flags = ITS_MSI_FLAGS_SUPPORTED,
.required_flags = ITS_MSI_FLAGS_REQUIRED,
- .chip_flags = MSI_CHIP_FLAG_SET_EOI | MSI_CHIP_FLAG_SET_ACK,
+ .chip_flags = MSI_CHIP_FLAG_SET_EOI,
.bus_select_token = DOMAIN_BUS_NEXUS,
.bus_select_mask = MATCH_PCI_MSI | MATCH_PLATFORM_MSI,
.prefix = "ITS-",
diff --git a/drivers/irqchip/irq-gic-v3-mbi.c b/drivers/irqchip/irq-gic-v3-mbi.c
index 34e9ca77a8c3..647b18e24e0c 100644
--- a/drivers/irqchip/irq-gic-v3-mbi.c
+++ b/drivers/irqchip/irq-gic-v3-mbi.c
@@ -197,7 +197,7 @@ static bool mbi_init_dev_msi_info(struct device *dev, struct irq_domain *domain,
static const struct msi_parent_ops gic_v3_mbi_msi_parent_ops = {
.supported_flags = MBI_MSI_FLAGS_SUPPORTED,
.required_flags = MBI_MSI_FLAGS_REQUIRED,
- .chip_flags = MSI_CHIP_FLAG_SET_EOI | MSI_CHIP_FLAG_SET_ACK,
+ .chip_flags = MSI_CHIP_FLAG_SET_EOI,
.bus_select_token = DOMAIN_BUS_NEXUS,
.bus_select_mask = MATCH_PCI_MSI | MATCH_PLATFORM_MSI,
.prefix = "MBI-",
diff --git a/drivers/irqchip/irq-mvebu-gicp.c b/drivers/irqchip/irq-mvebu-gicp.c
index d67f93f6d750..60b976286636 100644
--- a/drivers/irqchip/irq-mvebu-gicp.c
+++ b/drivers/irqchip/irq-mvebu-gicp.c
@@ -161,7 +161,7 @@ static const struct irq_domain_ops gicp_domain_ops = {
static const struct msi_parent_ops gicp_msi_parent_ops = {
.supported_flags = GICP_MSI_FLAGS_SUPPORTED,
.required_flags = GICP_MSI_FLAGS_REQUIRED,
- .chip_flags = MSI_CHIP_FLAG_SET_EOI | MSI_CHIP_FLAG_SET_ACK,
+ .chip_flags = MSI_CHIP_FLAG_SET_EOI,
.bus_select_token = DOMAIN_BUS_GENERIC_MSI,
.bus_select_mask = MATCH_PLATFORM_MSI,
.prefix = "GICP-",
diff --git a/drivers/irqchip/irq-mvebu-odmi.c b/drivers/irqchip/irq-mvebu-odmi.c
index 28f7e81df94f..54f6f0811573 100644
--- a/drivers/irqchip/irq-mvebu-odmi.c
+++ b/drivers/irqchip/irq-mvebu-odmi.c
@@ -157,7 +157,7 @@ static const struct irq_domain_ops odmi_domain_ops = {
static const struct msi_parent_ops odmi_msi_parent_ops = {
.supported_flags = ODMI_MSI_FLAGS_SUPPORTED,
.required_flags = ODMI_MSI_FLAGS_REQUIRED,
- .chip_flags = MSI_CHIP_FLAG_SET_EOI | MSI_CHIP_FLAG_SET_ACK,
+ .chip_flags = MSI_CHIP_FLAG_SET_EOI,
.bus_select_token = DOMAIN_BUS_GENERIC_MSI,
.bus_select_mask = MATCH_PLATFORM_MSI,
.prefix = "ODMI-",
diff --git a/drivers/irqchip/irq-riscv-imsic-state.c b/drivers/irqchip/irq-riscv-imsic-state.c
index bdf5cd2037f2..62f76950a113 100644
--- a/drivers/irqchip/irq-riscv-imsic-state.c
+++ b/drivers/irqchip/irq-riscv-imsic-state.c
@@ -208,17 +208,17 @@ skip:
}
#ifdef CONFIG_SMP
-static void __imsic_local_timer_start(struct imsic_local_priv *lpriv)
+static void __imsic_local_timer_start(struct imsic_local_priv *lpriv, unsigned int cpu)
{
lockdep_assert_held(&lpriv->lock);
if (!timer_pending(&lpriv->timer)) {
lpriv->timer.expires = jiffies + 1;
- add_timer_on(&lpriv->timer, smp_processor_id());
+ add_timer_on(&lpriv->timer, cpu);
}
}
#else
-static inline void __imsic_local_timer_start(struct imsic_local_priv *lpriv)
+static inline void __imsic_local_timer_start(struct imsic_local_priv *lpriv, unsigned int cpu)
{
}
#endif
@@ -233,7 +233,7 @@ void imsic_local_sync_all(bool force_all)
if (force_all)
bitmap_fill(lpriv->dirty_bitmap, imsic->global.nr_ids + 1);
if (!__imsic_local_sync(lpriv))
- __imsic_local_timer_start(lpriv);
+ __imsic_local_timer_start(lpriv, smp_processor_id());
raw_spin_unlock_irqrestore(&lpriv->lock, flags);
}
@@ -278,7 +278,7 @@ static void __imsic_remote_sync(struct imsic_local_priv *lpriv, unsigned int cpu
return;
}
- __imsic_local_timer_start(lpriv);
+ __imsic_local_timer_start(lpriv, cpu);
}
}
#else
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 813b38aec3e4..c40db9c161c1 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -293,8 +293,7 @@ static void __write_super(struct cache_sb *sb, struct cache_sb_disk *out,
bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_META;
bio->bi_iter.bi_sector = SB_SECTOR;
- __bio_add_page(bio, virt_to_page(out), SB_SIZE,
- offset_in_page(out));
+ bio_add_virt_nofail(bio, out, SB_SIZE);
out->offset = cpu_to_le64(sb->offset);
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index f0b5a6931161..d098e75e3461 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -1364,7 +1364,7 @@ static void use_bio(struct dm_buffer *b, enum req_op op, sector_t sector,
ptr = (char *)b->data + offset;
len = n_sectors << SECTOR_SHIFT;
- __bio_add_page(bio, virt_to_page(ptr), len, offset_in_page(ptr));
+ bio_add_virt_nofail(bio, ptr, len);
submit_bio(bio);
}
diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c
index cc3d3897ef42..1f626066e8cc 100644
--- a/drivers/md/dm-integrity.c
+++ b/drivers/md/dm-integrity.c
@@ -2557,14 +2557,8 @@ static void dm_integrity_inline_recheck(struct work_struct *w)
char *mem;
outgoing_bio = bio_alloc_bioset(ic->dev->bdev, 1, REQ_OP_READ, GFP_NOIO, &ic->recheck_bios);
-
- r = bio_add_page(outgoing_bio, virt_to_page(outgoing_data), ic->sectors_per_block << SECTOR_SHIFT, 0);
- if (unlikely(r != (ic->sectors_per_block << SECTOR_SHIFT))) {
- bio_put(outgoing_bio);
- bio->bi_status = BLK_STS_RESOURCE;
- bio_endio(bio);
- return;
- }
+ bio_add_virt_nofail(outgoing_bio, outgoing_data,
+ ic->sectors_per_block << SECTOR_SHIFT);
bip = bio_integrity_alloc(outgoing_bio, GFP_NOIO, 1);
if (IS_ERR(bip)) {
@@ -3211,7 +3205,8 @@ next_chunk:
bio = bio_alloc_bioset(ic->dev->bdev, 1, REQ_OP_READ, GFP_NOIO, &ic->recalc_bios);
bio->bi_iter.bi_sector = ic->start + SB_SECTORS + range.logical_sector;
- __bio_add_page(bio, virt_to_page(recalc_buffer), range.n_sectors << SECTOR_SHIFT, offset_in_page(recalc_buffer));
+ bio_add_virt_nofail(bio, recalc_buffer,
+ range.n_sectors << SECTOR_SHIFT);
r = submit_bio_wait(bio);
bio_put(bio);
if (unlikely(r)) {
@@ -3228,7 +3223,8 @@ next_chunk:
bio = bio_alloc_bioset(ic->dev->bdev, 1, REQ_OP_WRITE, GFP_NOIO, &ic->recalc_bios);
bio->bi_iter.bi_sector = ic->start + SB_SECTORS + range.logical_sector;
- __bio_add_page(bio, virt_to_page(recalc_buffer), range.n_sectors << SECTOR_SHIFT, offset_in_page(recalc_buffer));
+ bio_add_virt_nofail(bio, recalc_buffer,
+ range.n_sectors << SECTOR_SHIFT);
bip = bio_integrity_alloc(bio, GFP_NOIO, 1);
if (unlikely(IS_ERR(bip))) {
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 6adc55fd90d3..127138c61be5 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -14,6 +14,7 @@
#include "raid5.h"
#include "raid10.h"
#include "md-bitmap.h"
+#include "dm-core.h"
#include <linux/device-mapper.h>
@@ -3308,6 +3309,7 @@ size_check:
/* Disable/enable discard support on raid set. */
configure_discard_support(rs);
+ rs->md.dm_gendisk = ti->table->md->disk;
mddev_unlock(&rs->md);
return 0;
@@ -3327,6 +3329,7 @@ static void raid_dtr(struct dm_target *ti)
mddev_lock_nointr(&rs->md);
md_stop(&rs->md);
+ rs->md.dm_gendisk = NULL;
mddev_unlock(&rs->md);
if (work_pending(&rs->md.event_work))
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 9daa78c5fe33..0fde115e921f 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -111,32 +111,48 @@ static void md_wakeup_thread_directly(struct md_thread __rcu *thread);
/* Default safemode delay: 200 msec */
#define DEFAULT_SAFEMODE_DELAY ((200 * HZ)/1000 +1)
/*
- * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
- * is 1000 KB/sec, so the extra system load does not show up that much.
- * Increase it if you want to have more _guaranteed_ speed. Note that
- * the RAID driver will use the maximum available bandwidth if the IO
- * subsystem is idle. There is also an 'absolute maximum' reconstruction
- * speed limit - in case reconstruction slows down your system despite
- * idle IO detection.
+ * Current RAID-1,4,5,6,10 parallel reconstruction 'guaranteed speed limit'
+ * is sysctl_speed_limit_min, 1000 KB/sec by default, so the extra system load
+ * does not show up that much. Increase it if you want to have more guaranteed
+ * speed. Note that the RAID driver will use the maximum bandwidth
+ * sysctl_speed_limit_max, 200 MB/sec by default, if the IO subsystem is idle.
*
- * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
- * or /sys/block/mdX/md/sync_speed_{min,max}
+ * Background sync IO speed control:
+ *
+ * - below speed min:
+ * no limit;
+ * - above speed min and below speed max:
+ * a) if mddev is idle, then no limit;
+ * b) if mddev is busy handling normal IO, then limit inflight sync IO
+ * to sync_io_depth;
+ * - above speed max:
+ * sync IO can't be issued;
+ *
+ * Following configurations can be changed via /proc/sys/dev/raid/ for system
+ * or /sys/block/mdX/md/ for one array.
*/
-
static int sysctl_speed_limit_min = 1000;
static int sysctl_speed_limit_max = 200000;
-static inline int speed_min(struct mddev *mddev)
+static int sysctl_sync_io_depth = 32;
+
+static int speed_min(struct mddev *mddev)
{
return mddev->sync_speed_min ?
mddev->sync_speed_min : sysctl_speed_limit_min;
}
-static inline int speed_max(struct mddev *mddev)
+static int speed_max(struct mddev *mddev)
{
return mddev->sync_speed_max ?
mddev->sync_speed_max : sysctl_speed_limit_max;
}
+static int sync_io_depth(struct mddev *mddev)
+{
+ return mddev->sync_io_depth ?
+ mddev->sync_io_depth : sysctl_sync_io_depth;
+}
+
static void rdev_uninit_serial(struct md_rdev *rdev)
{
if (!test_and_clear_bit(CollisionCheck, &rdev->flags))
@@ -293,14 +309,21 @@ static const struct ctl_table raid_table[] = {
.procname = "speed_limit_min",
.data = &sysctl_speed_limit_min,
.maxlen = sizeof(int),
- .mode = S_IRUGO|S_IWUSR,
+ .mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "speed_limit_max",
.data = &sysctl_speed_limit_max,
.maxlen = sizeof(int),
- .mode = S_IRUGO|S_IWUSR,
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "sync_io_depth",
+ .data = &sysctl_sync_io_depth,
+ .maxlen = sizeof(int),
+ .mode = 0644,
.proc_handler = proc_dointvec,
},
};
@@ -5091,7 +5114,7 @@ static ssize_t
sync_min_show(struct mddev *mddev, char *page)
{
return sprintf(page, "%d (%s)\n", speed_min(mddev),
- mddev->sync_speed_min ? "local": "system");
+ mddev->sync_speed_min ? "local" : "system");
}
static ssize_t
@@ -5100,7 +5123,7 @@ sync_min_store(struct mddev *mddev, const char *buf, size_t len)
unsigned int min;
int rv;
- if (strncmp(buf, "system", 6)==0) {
+ if (strncmp(buf, "system", 6) == 0) {
min = 0;
} else {
rv = kstrtouint(buf, 10, &min);
@@ -5120,7 +5143,7 @@ static ssize_t
sync_max_show(struct mddev *mddev, char *page)
{
return sprintf(page, "%d (%s)\n", speed_max(mddev),
- mddev->sync_speed_max ? "local": "system");
+ mddev->sync_speed_max ? "local" : "system");
}
static ssize_t
@@ -5129,7 +5152,7 @@ sync_max_store(struct mddev *mddev, const char *buf, size_t len)
unsigned int max;
int rv;
- if (strncmp(buf, "system", 6)==0) {
+ if (strncmp(buf, "system", 6) == 0) {
max = 0;
} else {
rv = kstrtouint(buf, 10, &max);
@@ -5146,6 +5169,35 @@ static struct md_sysfs_entry md_sync_max =
__ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store);
static ssize_t
+sync_io_depth_show(struct mddev *mddev, char *page)
+{
+ return sprintf(page, "%d (%s)\n", sync_io_depth(mddev),
+ mddev->sync_io_depth ? "local" : "system");
+}
+
+static ssize_t
+sync_io_depth_store(struct mddev *mddev, const char *buf, size_t len)
+{
+ unsigned int max;
+ int rv;
+
+ if (strncmp(buf, "system", 6) == 0) {
+ max = 0;
+ } else {
+ rv = kstrtouint(buf, 10, &max);
+ if (rv < 0)
+ return rv;
+ if (max == 0)
+ return -EINVAL;
+ }
+ mddev->sync_io_depth = max;
+ return len;
+}
+
+static struct md_sysfs_entry md_sync_io_depth =
+__ATTR_RW(sync_io_depth);
+
+static ssize_t
degraded_show(struct mddev *mddev, char *page)
{
return sprintf(page, "%d\n", mddev->degraded);
@@ -5671,6 +5723,7 @@ static struct attribute *md_redundancy_attrs[] = {
&md_mismatches.attr,
&md_sync_min.attr,
&md_sync_max.attr,
+ &md_sync_io_depth.attr,
&md_sync_speed.attr,
&md_sync_force_parallel.attr,
&md_sync_completed.attr,
@@ -8572,50 +8625,55 @@ void md_cluster_stop(struct mddev *mddev)
put_cluster_ops(mddev);
}
-static int is_mddev_idle(struct mddev *mddev, int init)
+static bool is_rdev_holder_idle(struct md_rdev *rdev, bool init)
{
+ unsigned long last_events = rdev->last_events;
+
+ if (!bdev_is_partition(rdev->bdev))
+ return true;
+
+ /*
+ * If rdev is partition, and user doesn't issue IO to the array, the
+ * array is still not idle if user issues IO to other partitions.
+ */
+ rdev->last_events = part_stat_read_accum(rdev->bdev->bd_disk->part0,
+ sectors) -
+ part_stat_read_accum(rdev->bdev, sectors);
+
+ return init || rdev->last_events <= last_events;
+}
+
+/*
+ * mddev is idle if following conditions are matched since last check:
+ * 1) mddev doesn't have normal IO completed;
+ * 2) mddev doesn't have inflight normal IO;
+ * 3) if any member disk is partition, and other partitions don't have IO
+ * completed;
+ *
+ * Noted this checking rely on IO accounting is enabled.
+ */
+static bool is_mddev_idle(struct mddev *mddev, int init)
+{
+ unsigned long last_events = mddev->normal_io_events;
+ struct gendisk *disk;
struct md_rdev *rdev;
- int idle;
- int curr_events;
+ bool idle = true;
- idle = 1;
- rcu_read_lock();
- rdev_for_each_rcu(rdev, mddev) {
- struct gendisk *disk = rdev->bdev->bd_disk;
+ disk = mddev_is_dm(mddev) ? mddev->dm_gendisk : mddev->gendisk;
+ if (!disk)
+ return true;
- if (!init && !blk_queue_io_stat(disk->queue))
- continue;
+ mddev->normal_io_events = part_stat_read_accum(disk->part0, sectors);
+ if (!init && (mddev->normal_io_events > last_events ||
+ bdev_count_inflight(disk->part0)))
+ idle = false;
- curr_events = (int)part_stat_read_accum(disk->part0, sectors) -
- atomic_read(&disk->sync_io);
- /* sync IO will cause sync_io to increase before the disk_stats
- * as sync_io is counted when a request starts, and
- * disk_stats is counted when it completes.
- * So resync activity will cause curr_events to be smaller than
- * when there was no such activity.
- * non-sync IO will cause disk_stat to increase without
- * increasing sync_io so curr_events will (eventually)
- * be larger than it was before. Once it becomes
- * substantially larger, the test below will cause
- * the array to appear non-idle, and resync will slow
- * down.
- * If there is a lot of outstanding resync activity when
- * we set last_event to curr_events, then all that activity
- * completing might cause the array to appear non-idle
- * and resync will be slowed down even though there might
- * not have been non-resync activity. This will only
- * happen once though. 'last_events' will soon reflect
- * the state where there is little or no outstanding
- * resync requests, and further resync activity will
- * always make curr_events less than last_events.
- *
- */
- if (init || curr_events - rdev->last_events > 64) {
- rdev->last_events = curr_events;
- idle = 0;
- }
- }
+ rcu_read_lock();
+ rdev_for_each_rcu(rdev, mddev)
+ if (!is_rdev_holder_idle(rdev, init))
+ idle = false;
rcu_read_unlock();
+
return idle;
}
@@ -8927,6 +8985,23 @@ static sector_t md_sync_position(struct mddev *mddev, enum sync_action action)
}
}
+static bool sync_io_within_limit(struct mddev *mddev)
+{
+ int io_sectors;
+
+ /*
+ * For raid456, sync IO is stripe(4k) per IO, for other levels, it's
+ * RESYNC_PAGES(64k) per IO.
+ */
+ if (mddev->level == 4 || mddev->level == 5 || mddev->level == 6)
+ io_sectors = 8;
+ else
+ io_sectors = 128;
+
+ return atomic_read(&mddev->recovery_active) <
+ io_sectors * sync_io_depth(mddev);
+}
+
#define SYNC_MARKS 10
#define SYNC_MARK_STEP (3*HZ)
#define UPDATE_FREQUENCY (5*60*HZ)
@@ -9195,7 +9270,8 @@ void md_do_sync(struct md_thread *thread)
msleep(500);
goto repeat;
}
- if (!is_mddev_idle(mddev, 0)) {
+ if (!sync_io_within_limit(mddev) &&
+ !is_mddev_idle(mddev, 0)) {
/*
* Give other IO more of a chance.
* The faster the devices, the less we wait.
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 1cf00a04bcdd..d45a9e6ead80 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -132,7 +132,7 @@ struct md_rdev {
sector_t sectors; /* Device size (in 512bytes sectors) */
struct mddev *mddev; /* RAID array if running */
- int last_events; /* IO event timestamp */
+ unsigned long last_events; /* IO event timestamp */
/*
* If meta_bdev is non-NULL, it means that a separate device is
@@ -404,7 +404,8 @@ struct mddev {
* are happening, so run/
* takeover/stop are not safe
*/
- struct gendisk *gendisk;
+ struct gendisk *gendisk; /* mdraid gendisk */
+ struct gendisk *dm_gendisk; /* dm-raid gendisk */
struct kobject kobj;
int hold_active;
@@ -483,6 +484,7 @@ struct mddev {
/* if zero, use the system-wide default */
int sync_speed_min;
int sync_speed_max;
+ int sync_io_depth;
/* resync even though the same disks are shared among md-devices */
int parallel_resync;
@@ -518,6 +520,7 @@ struct mddev {
* adding a spare
*/
+ unsigned long normal_io_events; /* IO event timestamp */
atomic_t recovery_active; /* blocks scheduled, but not written */
wait_queue_head_t recovery_wait;
sector_t recovery_cp;
@@ -714,17 +717,6 @@ static inline int mddev_trylock(struct mddev *mddev)
}
extern void mddev_unlock(struct mddev *mddev);
-static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors)
-{
- if (blk_queue_io_stat(bdev->bd_disk->queue))
- atomic_add(nr_sectors, &bdev->bd_disk->sync_io);
-}
-
-static inline void md_sync_acct_bio(struct bio *bio, unsigned long nr_sectors)
-{
- md_sync_acct(bio->bi_bdev, nr_sectors);
-}
-
struct md_personality
{
struct md_submodule_head head;
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index de9bccbe7337..657d481525be 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2382,7 +2382,6 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio)
wbio->bi_end_io = end_sync_write;
atomic_inc(&r1_bio->remaining);
- md_sync_acct(conf->mirrors[i].rdev->bdev, bio_sectors(wbio));
submit_bio_noacct(wbio);
}
@@ -3055,7 +3054,6 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
bio = r1_bio->bios[i];
if (bio->bi_end_io == end_sync_read) {
read_targets--;
- md_sync_acct_bio(bio, nr_sectors);
if (read_targets == 1)
bio->bi_opf &= ~MD_FAILFAST;
submit_bio_noacct(bio);
@@ -3064,7 +3062,6 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
} else {
atomic_set(&r1_bio->remaining, 1);
bio = r1_bio->bios[r1_bio->read_disk];
- md_sync_acct_bio(bio, nr_sectors);
if (read_targets == 1)
bio->bi_opf &= ~MD_FAILFAST;
submit_bio_noacct(bio);
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index ba32bac975b8..dce06bf65016 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -2426,7 +2426,6 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
atomic_inc(&conf->mirrors[d].rdev->nr_pending);
atomic_inc(&r10_bio->remaining);
- md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(tbio));
if (test_bit(FailFast, &conf->mirrors[d].rdev->flags))
tbio->bi_opf |= MD_FAILFAST;
@@ -2448,8 +2447,6 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
bio_copy_data(tbio, fbio);
d = r10_bio->devs[i].devnum;
atomic_inc(&r10_bio->remaining);
- md_sync_acct(conf->mirrors[d].replacement->bdev,
- bio_sectors(tbio));
submit_bio_noacct(tbio);
}
@@ -2583,13 +2580,10 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
d = r10_bio->devs[1].devnum;
if (wbio->bi_end_io) {
atomic_inc(&conf->mirrors[d].rdev->nr_pending);
- md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(wbio));
submit_bio_noacct(wbio);
}
if (wbio2) {
atomic_inc(&conf->mirrors[d].replacement->nr_pending);
- md_sync_acct(conf->mirrors[d].replacement->bdev,
- bio_sectors(wbio2));
submit_bio_noacct(wbio2);
}
}
@@ -3757,7 +3751,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
r10_bio->sectors = nr_sectors;
if (bio->bi_end_io == end_sync_read) {
- md_sync_acct_bio(bio, nr_sectors);
bio->bi_status = 0;
submit_bio_noacct(bio);
}
@@ -4880,7 +4873,6 @@ read_more:
r10_bio->sectors = nr_sectors;
/* Now submit the read */
- md_sync_acct_bio(read_bio, r10_bio->sectors);
atomic_inc(&r10_bio->remaining);
read_bio->bi_next = NULL;
submit_bio_noacct(read_bio);
@@ -4940,7 +4932,6 @@ static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio)
continue;
atomic_inc(&rdev->nr_pending);
- md_sync_acct_bio(b, r10_bio->sectors);
atomic_inc(&r10_bio->remaining);
b->bi_next = NULL;
submit_bio_noacct(b);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 6389383166c0..ca5b0e8ba707 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1240,10 +1240,6 @@ again:
}
if (rdev) {
- if (s->syncing || s->expanding || s->expanded
- || s->replacing)
- md_sync_acct(rdev->bdev, RAID5_STRIPE_SECTORS(conf));
-
set_bit(STRIPE_IO_STARTED, &sh->state);
bio_init(bi, rdev->bdev, &dev->vec, 1, op | op_flags);
@@ -1300,10 +1296,6 @@ again:
submit_bio_noacct(bi);
}
if (rrdev) {
- if (s->syncing || s->expanding || s->expanded
- || s->replacing)
- md_sync_acct(rrdev->bdev, RAID5_STRIPE_SECTORS(conf));
-
set_bit(STRIPE_IO_STARTED, &sh->state);
bio_init(rbi, rrdev->bdev, &dev->rvec, 1, op | op_flags);
diff --git a/drivers/mmc/host/sdhci-of-dwcmshc.c b/drivers/mmc/host/sdhci-of-dwcmshc.c
index 09b9ab15e499..a20d03fdd6a9 100644
--- a/drivers/mmc/host/sdhci-of-dwcmshc.c
+++ b/drivers/mmc/host/sdhci-of-dwcmshc.c
@@ -17,6 +17,7 @@
#include <linux/module.h>
#include <linux/of.h>
#include <linux/platform_device.h>
+#include <linux/pm_domain.h>
#include <linux/pm_runtime.h>
#include <linux/reset.h>
#include <linux/sizes.h>
@@ -745,6 +746,29 @@ static void dwcmshc_rk35xx_postinit(struct sdhci_host *host, struct dwcmshc_priv
}
}
+static void dwcmshc_rk3576_postinit(struct sdhci_host *host, struct dwcmshc_priv *dwc_priv)
+{
+ struct device *dev = mmc_dev(host->mmc);
+ int ret;
+
+ /*
+ * This works around the design of the RK3576's power domains, which
+ * makes the PD_NVM power domain, which the sdhci controller on the
+ * RK3576 is in, never come back the same way once it's run-time
+ * suspended once. This can happen during early kernel boot if no driver
+ * is using either PD_NVM or its child power domain PD_SDGMAC for a
+ * short moment, leading to it being turned off to save power. By
+ * keeping it on, sdhci suspending won't lead to PD_NVM becoming a
+ * candidate for getting turned off.
+ */
+ ret = dev_pm_genpd_rpm_always_on(dev, true);
+ if (ret && ret != -EOPNOTSUPP)
+ dev_warn(dev, "failed to set PD rpm always on, SoC may hang later: %pe\n",
+ ERR_PTR(ret));
+
+ dwcmshc_rk35xx_postinit(host, dwc_priv);
+}
+
static int th1520_execute_tuning(struct sdhci_host *host, u32 opcode)
{
struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host);
@@ -1176,6 +1200,18 @@ static const struct dwcmshc_pltfm_data sdhci_dwcmshc_rk35xx_pdata = {
.postinit = dwcmshc_rk35xx_postinit,
};
+static const struct dwcmshc_pltfm_data sdhci_dwcmshc_rk3576_pdata = {
+ .pdata = {
+ .ops = &sdhci_dwcmshc_rk35xx_ops,
+ .quirks = SDHCI_QUIRK_CAP_CLOCK_BASE_BROKEN |
+ SDHCI_QUIRK_BROKEN_TIMEOUT_VAL,
+ .quirks2 = SDHCI_QUIRK2_PRESET_VALUE_BROKEN |
+ SDHCI_QUIRK2_CLOCK_DIV_ZERO_BROKEN,
+ },
+ .init = dwcmshc_rk35xx_init,
+ .postinit = dwcmshc_rk3576_postinit,
+};
+
static const struct dwcmshc_pltfm_data sdhci_dwcmshc_th1520_pdata = {
.pdata = {
.ops = &sdhci_dwcmshc_th1520_ops,
@@ -1275,6 +1311,10 @@ static const struct of_device_id sdhci_dwcmshc_dt_ids[] = {
.data = &sdhci_dwcmshc_rk35xx_pdata,
},
{
+ .compatible = "rockchip,rk3576-dwcmshc",
+ .data = &sdhci_dwcmshc_rk3576_pdata,
+ },
+ {
.compatible = "rockchip,rk3568-dwcmshc",
.data = &sdhci_dwcmshc_rk35xx_pdata,
},
diff --git a/drivers/mmc/host/sdhci_am654.c b/drivers/mmc/host/sdhci_am654.c
index f75c31815ab0..73385ff4c0f3 100644
--- a/drivers/mmc/host/sdhci_am654.c
+++ b/drivers/mmc/host/sdhci_am654.c
@@ -155,6 +155,7 @@ struct sdhci_am654_data {
u32 tuning_loop;
#define SDHCI_AM654_QUIRK_FORCE_CDTEST BIT(0)
+#define SDHCI_AM654_QUIRK_SUPPRESS_V1P8_ENA BIT(1)
};
struct window {
@@ -166,6 +167,7 @@ struct window {
struct sdhci_am654_driver_data {
const struct sdhci_pltfm_data *pdata;
u32 flags;
+ u32 quirks;
#define IOMUX_PRESENT (1 << 0)
#define FREQSEL_2_BIT (1 << 1)
#define STRBSEL_4_BIT (1 << 2)
@@ -356,6 +358,29 @@ static void sdhci_j721e_4bit_set_clock(struct sdhci_host *host,
sdhci_set_clock(host, clock);
}
+static int sdhci_am654_start_signal_voltage_switch(struct mmc_host *mmc, struct mmc_ios *ios)
+{
+ struct sdhci_host *host = mmc_priv(mmc);
+ struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host);
+ struct sdhci_am654_data *sdhci_am654 = sdhci_pltfm_priv(pltfm_host);
+ int ret;
+
+ if ((sdhci_am654->quirks & SDHCI_AM654_QUIRK_SUPPRESS_V1P8_ENA) &&
+ ios->signal_voltage == MMC_SIGNAL_VOLTAGE_180) {
+ if (!IS_ERR(mmc->supply.vqmmc)) {
+ ret = mmc_regulator_set_vqmmc(mmc, ios);
+ if (ret < 0) {
+ pr_err("%s: Switching to 1.8V signalling voltage failed,\n",
+ mmc_hostname(mmc));
+ return -EIO;
+ }
+ }
+ return 0;
+ }
+
+ return sdhci_start_signal_voltage_switch(mmc, ios);
+}
+
static u8 sdhci_am654_write_power_on(struct sdhci_host *host, u8 val, int reg)
{
writeb(val, host->ioaddr + reg);
@@ -650,6 +675,12 @@ static const struct sdhci_am654_driver_data sdhci_j721e_4bit_drvdata = {
.flags = IOMUX_PRESENT,
};
+static const struct sdhci_am654_driver_data sdhci_am62_4bit_drvdata = {
+ .pdata = &sdhci_j721e_4bit_pdata,
+ .flags = IOMUX_PRESENT,
+ .quirks = SDHCI_AM654_QUIRK_SUPPRESS_V1P8_ENA,
+};
+
static const struct soc_device_attribute sdhci_am654_devices[] = {
{ .family = "AM65X",
.revision = "SR1.0",
@@ -872,7 +903,7 @@ static const struct of_device_id sdhci_am654_of_match[] = {
},
{
.compatible = "ti,am62-sdhci",
- .data = &sdhci_j721e_4bit_drvdata,
+ .data = &sdhci_am62_4bit_drvdata,
},
{ /* sentinel */ }
};
@@ -906,6 +937,7 @@ static int sdhci_am654_probe(struct platform_device *pdev)
pltfm_host = sdhci_priv(host);
sdhci_am654 = sdhci_pltfm_priv(pltfm_host);
sdhci_am654->flags = drvdata->flags;
+ sdhci_am654->quirks = drvdata->quirks;
clk_xin = devm_clk_get(dev, "clk_xin");
if (IS_ERR(clk_xin)) {
@@ -940,6 +972,7 @@ static int sdhci_am654_probe(struct platform_device *pdev)
goto err_pltfm_free;
}
+ host->mmc_host_ops.start_signal_voltage_switch = sdhci_am654_start_signal_voltage_switch;
host->mmc_host_ops.execute_tuning = sdhci_am654_execute_tuning;
pm_runtime_get_noresume(dev);
diff --git a/drivers/net/can/kvaser_pciefd.c b/drivers/net/can/kvaser_pciefd.c
index cf0d51805272..f6921368cd14 100644
--- a/drivers/net/can/kvaser_pciefd.c
+++ b/drivers/net/can/kvaser_pciefd.c
@@ -16,6 +16,7 @@
#include <linux/netdevice.h>
#include <linux/pci.h>
#include <linux/timer.h>
+#include <net/netdev_queues.h>
MODULE_LICENSE("Dual BSD/GPL");
MODULE_AUTHOR("Kvaser AB <support@kvaser.com>");
@@ -410,10 +411,13 @@ struct kvaser_pciefd_can {
void __iomem *reg_base;
struct can_berr_counter bec;
u8 cmd_seq;
+ u8 tx_max_count;
+ u8 tx_idx;
+ u8 ack_idx;
int err_rep_cnt;
- int echo_idx;
+ unsigned int completed_tx_pkts;
+ unsigned int completed_tx_bytes;
spinlock_t lock; /* Locks sensitive registers (e.g. MODE) */
- spinlock_t echo_lock; /* Locks the message echo buffer */
struct timer_list bec_poll_timer;
struct completion start_comp, flush_comp;
};
@@ -714,6 +718,9 @@ static int kvaser_pciefd_open(struct net_device *netdev)
int ret;
struct kvaser_pciefd_can *can = netdev_priv(netdev);
+ can->tx_idx = 0;
+ can->ack_idx = 0;
+
ret = open_candev(netdev);
if (ret)
return ret;
@@ -745,21 +752,26 @@ static int kvaser_pciefd_stop(struct net_device *netdev)
timer_delete(&can->bec_poll_timer);
}
can->can.state = CAN_STATE_STOPPED;
+ netdev_reset_queue(netdev);
close_candev(netdev);
return ret;
}
+static unsigned int kvaser_pciefd_tx_avail(const struct kvaser_pciefd_can *can)
+{
+ return can->tx_max_count - (READ_ONCE(can->tx_idx) - READ_ONCE(can->ack_idx));
+}
+
static int kvaser_pciefd_prepare_tx_packet(struct kvaser_pciefd_tx_packet *p,
- struct kvaser_pciefd_can *can,
+ struct can_priv *can, u8 seq,
struct sk_buff *skb)
{
struct canfd_frame *cf = (struct canfd_frame *)skb->data;
int packet_size;
- int seq = can->echo_idx;
memset(p, 0, sizeof(*p));
- if (can->can.ctrlmode & CAN_CTRLMODE_ONE_SHOT)
+ if (can->ctrlmode & CAN_CTRLMODE_ONE_SHOT)
p->header[1] |= KVASER_PCIEFD_TPACKET_SMS;
if (cf->can_id & CAN_RTR_FLAG)
@@ -782,7 +794,7 @@ static int kvaser_pciefd_prepare_tx_packet(struct kvaser_pciefd_tx_packet *p,
} else {
p->header[1] |=
FIELD_PREP(KVASER_PCIEFD_RPACKET_DLC_MASK,
- can_get_cc_dlc((struct can_frame *)cf, can->can.ctrlmode));
+ can_get_cc_dlc((struct can_frame *)cf, can->ctrlmode));
}
p->header[1] |= FIELD_PREP(KVASER_PCIEFD_PACKET_SEQ_MASK, seq);
@@ -797,22 +809,24 @@ static netdev_tx_t kvaser_pciefd_start_xmit(struct sk_buff *skb,
struct net_device *netdev)
{
struct kvaser_pciefd_can *can = netdev_priv(netdev);
- unsigned long irq_flags;
struct kvaser_pciefd_tx_packet packet;
+ unsigned int seq = can->tx_idx & (can->can.echo_skb_max - 1);
+ unsigned int frame_len;
int nr_words;
- u8 count;
if (can_dev_dropped_skb(netdev, skb))
return NETDEV_TX_OK;
+ if (!netif_subqueue_maybe_stop(netdev, 0, kvaser_pciefd_tx_avail(can), 1, 1))
+ return NETDEV_TX_BUSY;
- nr_words = kvaser_pciefd_prepare_tx_packet(&packet, can, skb);
+ nr_words = kvaser_pciefd_prepare_tx_packet(&packet, &can->can, seq, skb);
- spin_lock_irqsave(&can->echo_lock, irq_flags);
/* Prepare and save echo skb in internal slot */
- can_put_echo_skb(skb, netdev, can->echo_idx, 0);
-
- /* Move echo index to the next slot */
- can->echo_idx = (can->echo_idx + 1) % can->can.echo_skb_max;
+ WRITE_ONCE(can->can.echo_skb[seq], NULL);
+ frame_len = can_skb_get_frame_len(skb);
+ can_put_echo_skb(skb, netdev, seq, frame_len);
+ netdev_sent_queue(netdev, frame_len);
+ WRITE_ONCE(can->tx_idx, can->tx_idx + 1);
/* Write header to fifo */
iowrite32(packet.header[0],
@@ -836,14 +850,7 @@ static netdev_tx_t kvaser_pciefd_start_xmit(struct sk_buff *skb,
KVASER_PCIEFD_KCAN_FIFO_LAST_REG);
}
- count = FIELD_GET(KVASER_PCIEFD_KCAN_TX_NR_PACKETS_CURRENT_MASK,
- ioread32(can->reg_base + KVASER_PCIEFD_KCAN_TX_NR_PACKETS_REG));
- /* No room for a new message, stop the queue until at least one
- * successful transmit
- */
- if (count >= can->can.echo_skb_max || can->can.echo_skb[can->echo_idx])
- netif_stop_queue(netdev);
- spin_unlock_irqrestore(&can->echo_lock, irq_flags);
+ netif_subqueue_maybe_stop(netdev, 0, kvaser_pciefd_tx_avail(can), 1, 1);
return NETDEV_TX_OK;
}
@@ -970,6 +977,8 @@ static int kvaser_pciefd_setup_can_ctrls(struct kvaser_pciefd *pcie)
can->kv_pcie = pcie;
can->cmd_seq = 0;
can->err_rep_cnt = 0;
+ can->completed_tx_pkts = 0;
+ can->completed_tx_bytes = 0;
can->bec.txerr = 0;
can->bec.rxerr = 0;
@@ -983,11 +992,10 @@ static int kvaser_pciefd_setup_can_ctrls(struct kvaser_pciefd *pcie)
tx_nr_packets_max =
FIELD_GET(KVASER_PCIEFD_KCAN_TX_NR_PACKETS_MAX_MASK,
ioread32(can->reg_base + KVASER_PCIEFD_KCAN_TX_NR_PACKETS_REG));
+ can->tx_max_count = min(KVASER_PCIEFD_CAN_TX_MAX_COUNT, tx_nr_packets_max - 1);
can->can.clock.freq = pcie->freq;
- can->can.echo_skb_max = min(KVASER_PCIEFD_CAN_TX_MAX_COUNT, tx_nr_packets_max - 1);
- can->echo_idx = 0;
- spin_lock_init(&can->echo_lock);
+ can->can.echo_skb_max = roundup_pow_of_two(can->tx_max_count);
spin_lock_init(&can->lock);
can->can.bittiming_const = &kvaser_pciefd_bittiming_const;
@@ -1201,7 +1209,7 @@ static int kvaser_pciefd_handle_data_packet(struct kvaser_pciefd *pcie,
skb = alloc_canfd_skb(priv->dev, &cf);
if (!skb) {
priv->dev->stats.rx_dropped++;
- return -ENOMEM;
+ return 0;
}
cf->len = can_fd_dlc2len(dlc);
@@ -1213,7 +1221,7 @@ static int kvaser_pciefd_handle_data_packet(struct kvaser_pciefd *pcie,
skb = alloc_can_skb(priv->dev, (struct can_frame **)&cf);
if (!skb) {
priv->dev->stats.rx_dropped++;
- return -ENOMEM;
+ return 0;
}
can_frame_set_cc_len((struct can_frame *)cf, dlc, priv->ctrlmode);
}
@@ -1231,7 +1239,9 @@ static int kvaser_pciefd_handle_data_packet(struct kvaser_pciefd *pcie,
priv->dev->stats.rx_packets++;
kvaser_pciefd_set_skb_timestamp(pcie, skb, p->timestamp);
- return netif_rx(skb);
+ netif_rx(skb);
+
+ return 0;
}
static void kvaser_pciefd_change_state(struct kvaser_pciefd_can *can,
@@ -1510,19 +1520,21 @@ static int kvaser_pciefd_handle_ack_packet(struct kvaser_pciefd *pcie,
netdev_dbg(can->can.dev, "Packet was flushed\n");
} else {
int echo_idx = FIELD_GET(KVASER_PCIEFD_PACKET_SEQ_MASK, p->header[0]);
- int len;
- u8 count;
+ unsigned int len, frame_len = 0;
struct sk_buff *skb;
+ if (echo_idx != (can->ack_idx & (can->can.echo_skb_max - 1)))
+ return 0;
skb = can->can.echo_skb[echo_idx];
- if (skb)
- kvaser_pciefd_set_skb_timestamp(pcie, skb, p->timestamp);
- len = can_get_echo_skb(can->can.dev, echo_idx, NULL);
- count = FIELD_GET(KVASER_PCIEFD_KCAN_TX_NR_PACKETS_CURRENT_MASK,
- ioread32(can->reg_base + KVASER_PCIEFD_KCAN_TX_NR_PACKETS_REG));
+ if (!skb)
+ return 0;
+ kvaser_pciefd_set_skb_timestamp(pcie, skb, p->timestamp);
+ len = can_get_echo_skb(can->can.dev, echo_idx, &frame_len);
- if (count < can->can.echo_skb_max && netif_queue_stopped(can->can.dev))
- netif_wake_queue(can->can.dev);
+ /* Pairs with barrier in kvaser_pciefd_start_xmit() */
+ smp_store_release(&can->ack_idx, can->ack_idx + 1);
+ can->completed_tx_pkts++;
+ can->completed_tx_bytes += frame_len;
if (!one_shot_fail) {
can->can.dev->stats.tx_bytes += len;
@@ -1638,32 +1650,51 @@ static int kvaser_pciefd_read_buffer(struct kvaser_pciefd *pcie, int dma_buf)
{
int pos = 0;
int res = 0;
+ unsigned int i;
do {
res = kvaser_pciefd_read_packet(pcie, &pos, dma_buf);
} while (!res && pos > 0 && pos < KVASER_PCIEFD_DMA_SIZE);
+ /* Report ACKs in this buffer to BQL en masse for correct periods */
+ for (i = 0; i < pcie->nr_channels; ++i) {
+ struct kvaser_pciefd_can *can = pcie->can[i];
+
+ if (!can->completed_tx_pkts)
+ continue;
+ netif_subqueue_completed_wake(can->can.dev, 0,
+ can->completed_tx_pkts,
+ can->completed_tx_bytes,
+ kvaser_pciefd_tx_avail(can), 1);
+ can->completed_tx_pkts = 0;
+ can->completed_tx_bytes = 0;
+ }
+
return res;
}
-static u32 kvaser_pciefd_receive_irq(struct kvaser_pciefd *pcie)
+static void kvaser_pciefd_receive_irq(struct kvaser_pciefd *pcie)
{
+ void __iomem *srb_cmd_reg = KVASER_PCIEFD_SRB_ADDR(pcie) + KVASER_PCIEFD_SRB_CMD_REG;
u32 irq = ioread32(KVASER_PCIEFD_SRB_ADDR(pcie) + KVASER_PCIEFD_SRB_IRQ_REG);
- if (irq & KVASER_PCIEFD_SRB_IRQ_DPD0)
+ iowrite32(irq, KVASER_PCIEFD_SRB_ADDR(pcie) + KVASER_PCIEFD_SRB_IRQ_REG);
+
+ if (irq & KVASER_PCIEFD_SRB_IRQ_DPD0) {
kvaser_pciefd_read_buffer(pcie, 0);
+ iowrite32(KVASER_PCIEFD_SRB_CMD_RDB0, srb_cmd_reg); /* Rearm buffer */
+ }
- if (irq & KVASER_PCIEFD_SRB_IRQ_DPD1)
+ if (irq & KVASER_PCIEFD_SRB_IRQ_DPD1) {
kvaser_pciefd_read_buffer(pcie, 1);
+ iowrite32(KVASER_PCIEFD_SRB_CMD_RDB1, srb_cmd_reg); /* Rearm buffer */
+ }
if (unlikely(irq & KVASER_PCIEFD_SRB_IRQ_DOF0 ||
irq & KVASER_PCIEFD_SRB_IRQ_DOF1 ||
irq & KVASER_PCIEFD_SRB_IRQ_DUF0 ||
irq & KVASER_PCIEFD_SRB_IRQ_DUF1))
dev_err(&pcie->pci->dev, "DMA IRQ error 0x%08X\n", irq);
-
- iowrite32(irq, KVASER_PCIEFD_SRB_ADDR(pcie) + KVASER_PCIEFD_SRB_IRQ_REG);
- return irq;
}
static void kvaser_pciefd_transmit_irq(struct kvaser_pciefd_can *can)
@@ -1691,29 +1722,22 @@ static irqreturn_t kvaser_pciefd_irq_handler(int irq, void *dev)
struct kvaser_pciefd *pcie = (struct kvaser_pciefd *)dev;
const struct kvaser_pciefd_irq_mask *irq_mask = pcie->driver_data->irq_mask;
u32 pci_irq = ioread32(KVASER_PCIEFD_PCI_IRQ_ADDR(pcie));
- u32 srb_irq = 0;
- u32 srb_release = 0;
int i;
if (!(pci_irq & irq_mask->all))
return IRQ_NONE;
+ iowrite32(0, KVASER_PCIEFD_PCI_IEN_ADDR(pcie));
+
if (pci_irq & irq_mask->kcan_rx0)
- srb_irq = kvaser_pciefd_receive_irq(pcie);
+ kvaser_pciefd_receive_irq(pcie);
for (i = 0; i < pcie->nr_channels; i++) {
if (pci_irq & irq_mask->kcan_tx[i])
kvaser_pciefd_transmit_irq(pcie->can[i]);
}
- if (srb_irq & KVASER_PCIEFD_SRB_IRQ_DPD0)
- srb_release |= KVASER_PCIEFD_SRB_CMD_RDB0;
-
- if (srb_irq & KVASER_PCIEFD_SRB_IRQ_DPD1)
- srb_release |= KVASER_PCIEFD_SRB_CMD_RDB1;
-
- if (srb_release)
- iowrite32(srb_release, KVASER_PCIEFD_SRB_ADDR(pcie) + KVASER_PCIEFD_SRB_CMD_REG);
+ iowrite32(irq_mask->all, KVASER_PCIEFD_PCI_IEN_ADDR(pcie));
return IRQ_HANDLED;
}
@@ -1733,13 +1757,22 @@ static void kvaser_pciefd_teardown_can_ctrls(struct kvaser_pciefd *pcie)
}
}
+static void kvaser_pciefd_disable_irq_srcs(struct kvaser_pciefd *pcie)
+{
+ unsigned int i;
+
+ /* Masking PCI_IRQ is insufficient as running ISR will unmask it */
+ iowrite32(0, KVASER_PCIEFD_SRB_ADDR(pcie) + KVASER_PCIEFD_SRB_IEN_REG);
+ for (i = 0; i < pcie->nr_channels; ++i)
+ iowrite32(0, pcie->can[i]->reg_base + KVASER_PCIEFD_KCAN_IEN_REG);
+}
+
static int kvaser_pciefd_probe(struct pci_dev *pdev,
const struct pci_device_id *id)
{
int ret;
struct kvaser_pciefd *pcie;
const struct kvaser_pciefd_irq_mask *irq_mask;
- void __iomem *irq_en_base;
pcie = devm_kzalloc(&pdev->dev, sizeof(*pcie), GFP_KERNEL);
if (!pcie)
@@ -1805,8 +1838,7 @@ static int kvaser_pciefd_probe(struct pci_dev *pdev,
KVASER_PCIEFD_SRB_ADDR(pcie) + KVASER_PCIEFD_SRB_IEN_REG);
/* Enable PCI interrupts */
- irq_en_base = KVASER_PCIEFD_PCI_IEN_ADDR(pcie);
- iowrite32(irq_mask->all, irq_en_base);
+ iowrite32(irq_mask->all, KVASER_PCIEFD_PCI_IEN_ADDR(pcie));
/* Ready the DMA buffers */
iowrite32(KVASER_PCIEFD_SRB_CMD_RDB0,
KVASER_PCIEFD_SRB_ADDR(pcie) + KVASER_PCIEFD_SRB_CMD_REG);
@@ -1820,8 +1852,7 @@ static int kvaser_pciefd_probe(struct pci_dev *pdev,
return 0;
err_free_irq:
- /* Disable PCI interrupts */
- iowrite32(0, irq_en_base);
+ kvaser_pciefd_disable_irq_srcs(pcie);
free_irq(pcie->pci->irq, pcie);
err_pci_free_irq_vectors:
@@ -1844,35 +1875,26 @@ err_disable_pci:
return ret;
}
-static void kvaser_pciefd_remove_all_ctrls(struct kvaser_pciefd *pcie)
-{
- int i;
-
- for (i = 0; i < pcie->nr_channels; i++) {
- struct kvaser_pciefd_can *can = pcie->can[i];
-
- if (can) {
- iowrite32(0, can->reg_base + KVASER_PCIEFD_KCAN_IEN_REG);
- unregister_candev(can->can.dev);
- timer_delete(&can->bec_poll_timer);
- kvaser_pciefd_pwm_stop(can);
- free_candev(can->can.dev);
- }
- }
-}
-
static void kvaser_pciefd_remove(struct pci_dev *pdev)
{
struct kvaser_pciefd *pcie = pci_get_drvdata(pdev);
+ unsigned int i;
- kvaser_pciefd_remove_all_ctrls(pcie);
+ for (i = 0; i < pcie->nr_channels; ++i) {
+ struct kvaser_pciefd_can *can = pcie->can[i];
- /* Disable interrupts */
- iowrite32(0, KVASER_PCIEFD_SRB_ADDR(pcie) + KVASER_PCIEFD_SRB_CTRL_REG);
- iowrite32(0, KVASER_PCIEFD_PCI_IEN_ADDR(pcie));
+ unregister_candev(can->can.dev);
+ timer_delete(&can->bec_poll_timer);
+ kvaser_pciefd_pwm_stop(can);
+ }
+ kvaser_pciefd_disable_irq_srcs(pcie);
free_irq(pcie->pci->irq, pcie);
pci_free_irq_vectors(pcie->pci);
+
+ for (i = 0; i < pcie->nr_channels; ++i)
+ free_candev(pcie->can[i]->can.dev);
+
pci_iounmap(pdev, pcie->reg_base);
pci_release_regions(pdev);
pci_disable_device(pdev);
diff --git a/drivers/net/can/slcan/slcan-core.c b/drivers/net/can/slcan/slcan-core.c
index 24c6622d36bd..58ff2ec1d975 100644
--- a/drivers/net/can/slcan/slcan-core.c
+++ b/drivers/net/can/slcan/slcan-core.c
@@ -71,12 +71,21 @@ MODULE_AUTHOR("Dario Binacchi <dario.binacchi@amarulasolutions.com>");
#define SLCAN_CMD_LEN 1
#define SLCAN_SFF_ID_LEN 3
#define SLCAN_EFF_ID_LEN 8
+#define SLCAN_DATA_LENGTH_LEN 1
+#define SLCAN_ERROR_LEN 1
#define SLCAN_STATE_LEN 1
#define SLCAN_STATE_BE_RXCNT_LEN 3
#define SLCAN_STATE_BE_TXCNT_LEN 3
-#define SLCAN_STATE_FRAME_LEN (1 + SLCAN_CMD_LEN + \
- SLCAN_STATE_BE_RXCNT_LEN + \
- SLCAN_STATE_BE_TXCNT_LEN)
+#define SLCAN_STATE_MSG_LEN (SLCAN_CMD_LEN + \
+ SLCAN_STATE_LEN + \
+ SLCAN_STATE_BE_RXCNT_LEN + \
+ SLCAN_STATE_BE_TXCNT_LEN)
+#define SLCAN_ERROR_MSG_LEN_MIN (SLCAN_CMD_LEN + \
+ SLCAN_ERROR_LEN + \
+ SLCAN_DATA_LENGTH_LEN)
+#define SLCAN_FRAME_MSG_LEN_MIN (SLCAN_CMD_LEN + \
+ SLCAN_SFF_ID_LEN + \
+ SLCAN_DATA_LENGTH_LEN)
struct slcan {
struct can_priv can;
@@ -176,6 +185,9 @@ static void slcan_bump_frame(struct slcan *sl)
u32 tmpid;
char *cmd = sl->rbuff;
+ if (sl->rcount < SLCAN_FRAME_MSG_LEN_MIN)
+ return;
+
skb = alloc_can_skb(sl->dev, &cf);
if (unlikely(!skb)) {
sl->dev->stats.rx_dropped++;
@@ -281,7 +293,7 @@ static void slcan_bump_state(struct slcan *sl)
return;
}
- if (state == sl->can.state || sl->rcount < SLCAN_STATE_FRAME_LEN)
+ if (state == sl->can.state || sl->rcount != SLCAN_STATE_MSG_LEN)
return;
cmd += SLCAN_STATE_BE_RXCNT_LEN + SLCAN_CMD_LEN + 1;
@@ -328,6 +340,9 @@ static void slcan_bump_err(struct slcan *sl)
bool rx_errors = false, tx_errors = false, rx_over_errors = false;
int i, len;
+ if (sl->rcount < SLCAN_ERROR_MSG_LEN_MIN)
+ return;
+
/* get len from sanitized ASCII value */
len = cmd[1];
if (len >= '0' && len < '9')
@@ -456,8 +471,7 @@ static void slcan_bump(struct slcan *sl)
static void slcan_unesc(struct slcan *sl, unsigned char s)
{
if ((s == '\r') || (s == '\a')) { /* CR or BEL ends the pdu */
- if (!test_and_clear_bit(SLF_ERROR, &sl->flags) &&
- sl->rcount > 4)
+ if (!test_and_clear_bit(SLF_ERROR, &sl->flags))
slcan_bump(sl);
sl->rcount = 0;
diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c
index 9eb39cfa5fb2..7216eb8f9493 100644
--- a/drivers/net/dsa/b53/b53_common.c
+++ b/drivers/net/dsa/b53/b53_common.c
@@ -326,6 +326,26 @@ static void b53_get_vlan_entry(struct b53_device *dev, u16 vid,
}
}
+static void b53_set_eap_mode(struct b53_device *dev, int port, int mode)
+{
+ u64 eap_conf;
+
+ if (is5325(dev) || is5365(dev) || dev->chip_id == BCM5389_DEVICE_ID)
+ return;
+
+ b53_read64(dev, B53_EAP_PAGE, B53_PORT_EAP_CONF(port), &eap_conf);
+
+ if (is63xx(dev)) {
+ eap_conf &= ~EAP_MODE_MASK_63XX;
+ eap_conf |= (u64)mode << EAP_MODE_SHIFT_63XX;
+ } else {
+ eap_conf &= ~EAP_MODE_MASK;
+ eap_conf |= (u64)mode << EAP_MODE_SHIFT;
+ }
+
+ b53_write64(dev, B53_EAP_PAGE, B53_PORT_EAP_CONF(port), eap_conf);
+}
+
static void b53_set_forwarding(struct b53_device *dev, int enable)
{
u8 mgmt;
@@ -586,6 +606,13 @@ int b53_setup_port(struct dsa_switch *ds, int port)
b53_port_set_mcast_flood(dev, port, true);
b53_port_set_learning(dev, port, false);
+ /* Force all traffic to go to the CPU port to prevent the ASIC from
+ * trying to forward to bridged ports on matching FDB entries, then
+ * dropping frames because it isn't allowed to forward there.
+ */
+ if (dsa_is_user_port(ds, port))
+ b53_set_eap_mode(dev, port, EAP_MODE_SIMPLIFIED);
+
return 0;
}
EXPORT_SYMBOL(b53_setup_port);
@@ -2042,6 +2069,9 @@ int b53_br_join(struct dsa_switch *ds, int port, struct dsa_bridge bridge,
pvlan |= BIT(i);
}
+ /* Disable redirection of unknown SA to the CPU port */
+ b53_set_eap_mode(dev, port, EAP_MODE_BASIC);
+
/* Configure the local port VLAN control membership to include
* remote ports and update the local port bitmask
*/
@@ -2077,6 +2107,9 @@ void b53_br_leave(struct dsa_switch *ds, int port, struct dsa_bridge bridge)
pvlan &= ~BIT(i);
}
+ /* Enable redirection of unknown SA to the CPU port */
+ b53_set_eap_mode(dev, port, EAP_MODE_SIMPLIFIED);
+
b53_write16(dev, B53_PVLAN_PAGE, B53_PVLAN_PORT_MASK(port), pvlan);
dev->ports[port].vlan_ctl_mask = pvlan;
diff --git a/drivers/net/dsa/b53/b53_regs.h b/drivers/net/dsa/b53/b53_regs.h
index bfbcb66bef66..5f7a0e5c5709 100644
--- a/drivers/net/dsa/b53/b53_regs.h
+++ b/drivers/net/dsa/b53/b53_regs.h
@@ -50,6 +50,9 @@
/* Jumbo Frame Registers */
#define B53_JUMBO_PAGE 0x40
+/* EAP Registers */
+#define B53_EAP_PAGE 0x42
+
/* EEE Control Registers Page */
#define B53_EEE_PAGE 0x92
@@ -481,6 +484,17 @@
#define JMS_MAX_SIZE 9724
/*************************************************************************
+ * EAP Page Registers
+ *************************************************************************/
+#define B53_PORT_EAP_CONF(i) (0x20 + 8 * (i))
+#define EAP_MODE_SHIFT 51
+#define EAP_MODE_SHIFT_63XX 50
+#define EAP_MODE_MASK (0x3ull << EAP_MODE_SHIFT)
+#define EAP_MODE_MASK_63XX (0x3ull << EAP_MODE_SHIFT_63XX)
+#define EAP_MODE_BASIC 0
+#define EAP_MODE_SIMPLIFIED 3
+
+/*************************************************************************
* EEE Configuration Page Registers
*************************************************************************/
diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c
index 89f0796894af..f95a9aac56ee 100644
--- a/drivers/net/dsa/microchip/ksz_common.c
+++ b/drivers/net/dsa/microchip/ksz_common.c
@@ -265,16 +265,70 @@ static void ksz_phylink_mac_link_down(struct phylink_config *config,
unsigned int mode,
phy_interface_t interface);
+/**
+ * ksz_phylink_mac_disable_tx_lpi() - Callback to signal LPI support (Dummy)
+ * @config: phylink config structure
+ *
+ * This function is a dummy handler. See ksz_phylink_mac_enable_tx_lpi() for
+ * a detailed explanation of EEE/LPI handling in KSZ switches.
+ */
+static void ksz_phylink_mac_disable_tx_lpi(struct phylink_config *config)
+{
+}
+
+/**
+ * ksz_phylink_mac_enable_tx_lpi() - Callback to signal LPI support (Dummy)
+ * @config: phylink config structure
+ * @timer: timer value before entering LPI (unused)
+ * @tx_clock_stop: whether to stop the TX clock in LPI mode (unused)
+ *
+ * This function signals to phylink that the driver architecture supports
+ * LPI management, enabling phylink to control EEE advertisement during
+ * negotiation according to IEEE Std 802.3 (Clause 78).
+ *
+ * Hardware Management of EEE/LPI State:
+ * For KSZ switch ports with integrated PHYs (e.g., KSZ9893R ports 1-2),
+ * observation and testing suggest that the actual EEE / Low Power Idle (LPI)
+ * state transitions are managed autonomously by the hardware based on
+ * the auto-negotiation results. (Note: While the datasheet describes EEE
+ * operation based on negotiation, it doesn't explicitly detail the internal
+ * MAC/PHY interaction, so autonomous hardware management of the MAC state
+ * for LPI is inferred from observed behavior).
+ * This hardware control, consistent with the switch's ability to operate
+ * autonomously via strapping, means MAC-level software intervention is not
+ * required or exposed for managing the LPI state once EEE is negotiated.
+ * (Ref: KSZ9893R Data Sheet DS00002420D, primarily Section 4.7.5 explaining
+ * EEE, also Sections 4.1.7 on Auto-Negotiation and 3.2.1 on Configuration
+ * Straps).
+ *
+ * Additionally, ports configured as MAC interfaces (e.g., KSZ9893R port 3)
+ * lack documented MAC-level LPI control.
+ *
+ * Therefore, this callback performs no action and serves primarily to inform
+ * phylink of LPI awareness and to document the inferred hardware behavior.
+ *
+ * Returns: 0 (Always success)
+ */
+static int ksz_phylink_mac_enable_tx_lpi(struct phylink_config *config,
+ u32 timer, bool tx_clock_stop)
+{
+ return 0;
+}
+
static const struct phylink_mac_ops ksz88x3_phylink_mac_ops = {
.mac_config = ksz88x3_phylink_mac_config,
.mac_link_down = ksz_phylink_mac_link_down,
.mac_link_up = ksz8_phylink_mac_link_up,
+ .mac_disable_tx_lpi = ksz_phylink_mac_disable_tx_lpi,
+ .mac_enable_tx_lpi = ksz_phylink_mac_enable_tx_lpi,
};
static const struct phylink_mac_ops ksz8_phylink_mac_ops = {
.mac_config = ksz_phylink_mac_config,
.mac_link_down = ksz_phylink_mac_link_down,
.mac_link_up = ksz8_phylink_mac_link_up,
+ .mac_disable_tx_lpi = ksz_phylink_mac_disable_tx_lpi,
+ .mac_enable_tx_lpi = ksz_phylink_mac_enable_tx_lpi,
};
static const struct ksz_dev_ops ksz88xx_dev_ops = {
@@ -358,6 +412,8 @@ static const struct phylink_mac_ops ksz9477_phylink_mac_ops = {
.mac_config = ksz_phylink_mac_config,
.mac_link_down = ksz_phylink_mac_link_down,
.mac_link_up = ksz9477_phylink_mac_link_up,
+ .mac_disable_tx_lpi = ksz_phylink_mac_disable_tx_lpi,
+ .mac_enable_tx_lpi = ksz_phylink_mac_enable_tx_lpi,
};
static const struct ksz_dev_ops ksz9477_dev_ops = {
@@ -401,6 +457,8 @@ static const struct phylink_mac_ops lan937x_phylink_mac_ops = {
.mac_config = ksz_phylink_mac_config,
.mac_link_down = ksz_phylink_mac_link_down,
.mac_link_up = ksz9477_phylink_mac_link_up,
+ .mac_disable_tx_lpi = ksz_phylink_mac_disable_tx_lpi,
+ .mac_enable_tx_lpi = ksz_phylink_mac_enable_tx_lpi,
};
static const struct ksz_dev_ops lan937x_dev_ops = {
@@ -2016,6 +2074,18 @@ static void ksz_phylink_get_caps(struct dsa_switch *ds, int port,
if (dev->dev_ops->get_caps)
dev->dev_ops->get_caps(dev, port, config);
+
+ if (ds->ops->support_eee && ds->ops->support_eee(ds, port)) {
+ memcpy(config->lpi_interfaces, config->supported_interfaces,
+ sizeof(config->lpi_interfaces));
+
+ config->lpi_capabilities = MAC_100FD;
+ if (dev->info->gbit_capable[port])
+ config->lpi_capabilities |= MAC_1000FD;
+
+ /* EEE is fully operational */
+ config->eee_enabled_default = true;
+ }
}
void ksz_r_mib_stats64(struct ksz_device *dev, int port)
@@ -3008,31 +3078,6 @@ static u32 ksz_get_phy_flags(struct dsa_switch *ds, int port)
if (!port)
return MICREL_KSZ8_P1_ERRATA;
break;
- case KSZ8567_CHIP_ID:
- /* KSZ8567R Errata DS80000752C Module 4 */
- case KSZ8765_CHIP_ID:
- case KSZ8794_CHIP_ID:
- case KSZ8795_CHIP_ID:
- /* KSZ879x/KSZ877x/KSZ876x Errata DS80000687C Module 2 */
- case KSZ9477_CHIP_ID:
- /* KSZ9477S Errata DS80000754A Module 4 */
- case KSZ9567_CHIP_ID:
- /* KSZ9567S Errata DS80000756A Module 4 */
- case KSZ9896_CHIP_ID:
- /* KSZ9896C Errata DS80000757A Module 3 */
- case KSZ9897_CHIP_ID:
- case LAN9646_CHIP_ID:
- /* KSZ9897R Errata DS80000758C Module 4 */
- /* Energy Efficient Ethernet (EEE) feature select must be manually disabled
- * The EEE feature is enabled by default, but it is not fully
- * operational. It must be manually disabled through register
- * controls. If not disabled, the PHY ports can auto-negotiate
- * to enable EEE, and this feature can cause link drops when
- * linked to another device supporting EEE.
- *
- * The same item appears in the errata for all switches above.
- */
- return MICREL_NO_EEE;
}
return 0;
@@ -3466,6 +3511,20 @@ static int ksz_max_mtu(struct dsa_switch *ds, int port)
return -EOPNOTSUPP;
}
+/**
+ * ksz_support_eee - Determine Energy Efficient Ethernet (EEE) support for a
+ * port
+ * @ds: Pointer to the DSA switch structure
+ * @port: Port number to check
+ *
+ * This function also documents devices where EEE was initially advertised but
+ * later withdrawn due to reliability issues, as described in official errata
+ * documents. These devices are explicitly listed to record known limitations,
+ * even if there is no technical necessity for runtime checks.
+ *
+ * Returns: true if the internal PHY on the given port supports fully
+ * operational EEE, false otherwise.
+ */
static bool ksz_support_eee(struct dsa_switch *ds, int port)
{
struct ksz_device *dev = ds->priv;
@@ -3475,15 +3534,35 @@ static bool ksz_support_eee(struct dsa_switch *ds, int port)
switch (dev->chip_id) {
case KSZ8563_CHIP_ID:
+ case KSZ9563_CHIP_ID:
+ case KSZ9893_CHIP_ID:
+ return true;
case KSZ8567_CHIP_ID:
+ /* KSZ8567R Errata DS80000752C Module 4 */
+ case KSZ8765_CHIP_ID:
+ case KSZ8794_CHIP_ID:
+ case KSZ8795_CHIP_ID:
+ /* KSZ879x/KSZ877x/KSZ876x Errata DS80000687C Module 2 */
case KSZ9477_CHIP_ID:
- case KSZ9563_CHIP_ID:
+ /* KSZ9477S Errata DS80000754A Module 4 */
case KSZ9567_CHIP_ID:
- case KSZ9893_CHIP_ID:
+ /* KSZ9567S Errata DS80000756A Module 4 */
case KSZ9896_CHIP_ID:
+ /* KSZ9896C Errata DS80000757A Module 3 */
case KSZ9897_CHIP_ID:
case LAN9646_CHIP_ID:
- return true;
+ /* KSZ9897R Errata DS80000758C Module 4 */
+ /* Energy Efficient Ethernet (EEE) feature select must be
+ * manually disabled
+ * The EEE feature is enabled by default, but it is not fully
+ * operational. It must be manually disabled through register
+ * controls. If not disabled, the PHY ports can auto-negotiate
+ * to enable EEE, and this feature can cause link drops when
+ * linked to another device supporting EEE.
+ *
+ * The same item appears in the errata for all switches above.
+ */
+ break;
}
return false;
diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c
index f8454f3b6f9c..f674c400f05b 100644
--- a/drivers/net/dsa/sja1105/sja1105_main.c
+++ b/drivers/net/dsa/sja1105/sja1105_main.c
@@ -2081,6 +2081,7 @@ static void sja1105_bridge_stp_state_set(struct dsa_switch *ds, int port,
switch (state) {
case BR_STATE_DISABLED:
case BR_STATE_BLOCKING:
+ case BR_STATE_LISTENING:
/* From UM10944 description of DRPDTAG (why put this there?):
* "Management traffic flows to the port regardless of the state
* of the INGRESS flag". So BPDUs are still be allowed to pass.
@@ -2090,11 +2091,6 @@ static void sja1105_bridge_stp_state_set(struct dsa_switch *ds, int port,
mac[port].egress = false;
mac[port].dyn_learn = false;
break;
- case BR_STATE_LISTENING:
- mac[port].ingress = true;
- mac[port].egress = false;
- mac[port].dyn_learn = false;
- break;
case BR_STATE_LEARNING:
mac[port].ingress = true;
mac[port].egress = false;
diff --git a/drivers/net/ethernet/airoha/airoha_eth.c b/drivers/net/ethernet/airoha/airoha_eth.c
index d748dc6de923..1e9ab65218ff 100644
--- a/drivers/net/ethernet/airoha/airoha_eth.c
+++ b/drivers/net/ethernet/airoha/airoha_eth.c
@@ -614,7 +614,6 @@ static int airoha_qdma_rx_process(struct airoha_queue *q, int budget)
struct airoha_queue_entry *e = &q->entry[q->tail];
struct airoha_qdma_desc *desc = &q->desc[q->tail];
u32 hash, reason, msg1 = le32_to_cpu(desc->msg1);
- dma_addr_t dma_addr = le32_to_cpu(desc->addr);
struct page *page = virt_to_head_page(e->buf);
u32 desc_ctrl = le32_to_cpu(desc->ctrl);
struct airoha_gdm_port *port;
@@ -623,22 +622,16 @@ static int airoha_qdma_rx_process(struct airoha_queue *q, int budget)
if (!(desc_ctrl & QDMA_DESC_DONE_MASK))
break;
- if (!dma_addr)
- break;
-
- len = FIELD_GET(QDMA_DESC_LEN_MASK, desc_ctrl);
- if (!len)
- break;
-
q->tail = (q->tail + 1) % q->ndesc;
q->queued--;
- dma_sync_single_for_cpu(eth->dev, dma_addr,
+ dma_sync_single_for_cpu(eth->dev, e->dma_addr,
SKB_WITH_OVERHEAD(q->buf_size), dir);
+ len = FIELD_GET(QDMA_DESC_LEN_MASK, desc_ctrl);
data_len = q->skb ? q->buf_size
: SKB_WITH_OVERHEAD(q->buf_size);
- if (data_len < len)
+ if (!len || data_len < len)
goto free_frag;
p = airoha_qdma_get_gdm_port(eth, desc);
@@ -701,9 +694,12 @@ static int airoha_qdma_rx_process(struct airoha_queue *q, int budget)
q->skb = NULL;
continue;
free_frag:
- page_pool_put_full_page(q->page_pool, page, true);
- dev_kfree_skb(q->skb);
- q->skb = NULL;
+ if (q->skb) {
+ dev_kfree_skb(q->skb);
+ q->skb = NULL;
+ } else {
+ page_pool_put_full_page(q->page_pool, page, true);
+ }
}
airoha_qdma_fill_rx_queue(q);
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 86a5de44b6f3..6afc2ab6fad2 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -14013,13 +14013,28 @@ static void bnxt_unlock_sp(struct bnxt *bp)
netdev_unlock(bp->dev);
}
+/* Same as bnxt_lock_sp() with additional rtnl_lock */
+static void bnxt_rtnl_lock_sp(struct bnxt *bp)
+{
+ clear_bit(BNXT_STATE_IN_SP_TASK, &bp->state);
+ rtnl_lock();
+ netdev_lock(bp->dev);
+}
+
+static void bnxt_rtnl_unlock_sp(struct bnxt *bp)
+{
+ set_bit(BNXT_STATE_IN_SP_TASK, &bp->state);
+ netdev_unlock(bp->dev);
+ rtnl_unlock();
+}
+
/* Only called from bnxt_sp_task() */
static void bnxt_reset(struct bnxt *bp, bool silent)
{
- bnxt_lock_sp(bp);
+ bnxt_rtnl_lock_sp(bp);
if (test_bit(BNXT_STATE_OPEN, &bp->state))
bnxt_reset_task(bp, silent);
- bnxt_unlock_sp(bp);
+ bnxt_rtnl_unlock_sp(bp);
}
/* Only called from bnxt_sp_task() */
@@ -14027,9 +14042,9 @@ static void bnxt_rx_ring_reset(struct bnxt *bp)
{
int i;
- bnxt_lock_sp(bp);
+ bnxt_rtnl_lock_sp(bp);
if (!test_bit(BNXT_STATE_OPEN, &bp->state)) {
- bnxt_unlock_sp(bp);
+ bnxt_rtnl_unlock_sp(bp);
return;
}
/* Disable and flush TPA before resetting the RX ring */
@@ -14068,7 +14083,7 @@ static void bnxt_rx_ring_reset(struct bnxt *bp)
}
if (bp->flags & BNXT_FLAG_TPA)
bnxt_set_tpa(bp, true);
- bnxt_unlock_sp(bp);
+ bnxt_rtnl_unlock_sp(bp);
}
static void bnxt_fw_fatal_close(struct bnxt *bp)
@@ -14960,15 +14975,17 @@ static void bnxt_fw_reset_task(struct work_struct *work)
bp->fw_reset_state = BNXT_FW_RESET_STATE_OPENING;
fallthrough;
case BNXT_FW_RESET_STATE_OPENING:
- while (!netdev_trylock(bp->dev)) {
+ while (!rtnl_trylock()) {
bnxt_queue_fw_reset_work(bp, HZ / 10);
return;
}
+ netdev_lock(bp->dev);
rc = bnxt_open(bp->dev);
if (rc) {
netdev_err(bp->dev, "bnxt_open() failed during FW reset\n");
bnxt_fw_reset_abort(bp, rc);
netdev_unlock(bp->dev);
+ rtnl_unlock();
goto ulp_start;
}
@@ -14988,6 +15005,7 @@ static void bnxt_fw_reset_task(struct work_struct *work)
bnxt_dl_health_fw_status_update(bp, true);
}
netdev_unlock(bp->dev);
+ rtnl_unlock();
bnxt_ulp_start(bp, 0);
bnxt_reenable_sriov(bp);
netdev_lock(bp->dev);
@@ -15936,7 +15954,7 @@ err_reset:
rc);
napi_enable_locked(&bnapi->napi);
bnxt_db_nq_arm(bp, &cpr->cp_db, cpr->cp_raw_cons);
- bnxt_reset_task(bp, true);
+ netif_close(dev);
return rc;
}
@@ -16752,6 +16770,7 @@ static int bnxt_resume(struct device *device)
struct bnxt *bp = netdev_priv(dev);
int rc = 0;
+ rtnl_lock();
netdev_lock(dev);
rc = pci_enable_device(bp->pdev);
if (rc) {
@@ -16796,6 +16815,7 @@ static int bnxt_resume(struct device *device)
resume_exit:
netdev_unlock(bp->dev);
+ rtnl_unlock();
bnxt_ulp_start(bp, rc);
if (!rc)
bnxt_reenable_sriov(bp);
@@ -16961,6 +16981,7 @@ static void bnxt_io_resume(struct pci_dev *pdev)
int err;
netdev_info(bp->dev, "PCI Slot Resume\n");
+ rtnl_lock();
netdev_lock(netdev);
err = bnxt_hwrm_func_qcaps(bp);
@@ -16978,6 +16999,7 @@ static void bnxt_io_resume(struct pci_dev *pdev)
netif_device_attach(netdev);
netdev_unlock(netdev);
+ rtnl_unlock();
bnxt_ulp_start(bp, err);
if (!err)
bnxt_reenable_sriov(bp);
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c
index a8e930d5dbb0..7564705d6478 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c
@@ -20,6 +20,7 @@
#include <asm/byteorder.h>
#include <linux/bitmap.h>
#include <linux/auxiliary_bus.h>
+#include <net/netdev_lock.h>
#include "bnxt_hsi.h"
#include "bnxt.h"
@@ -309,14 +310,12 @@ void bnxt_ulp_irq_stop(struct bnxt *bp)
if (!ulp->msix_requested)
return;
- netdev_lock(bp->dev);
- ops = rcu_dereference(ulp->ulp_ops);
+ ops = netdev_lock_dereference(ulp->ulp_ops, bp->dev);
if (!ops || !ops->ulp_irq_stop)
return;
if (test_bit(BNXT_STATE_FW_RESET_DET, &bp->state))
reset = true;
ops->ulp_irq_stop(ulp->handle, reset);
- netdev_unlock(bp->dev);
}
}
@@ -335,8 +334,7 @@ void bnxt_ulp_irq_restart(struct bnxt *bp, int err)
if (!ulp->msix_requested)
return;
- netdev_lock(bp->dev);
- ops = rcu_dereference(ulp->ulp_ops);
+ ops = netdev_lock_dereference(ulp->ulp_ops, bp->dev);
if (!ops || !ops->ulp_irq_restart)
return;
@@ -348,7 +346,6 @@ void bnxt_ulp_irq_restart(struct bnxt *bp, int err)
bnxt_fill_msix_vecs(bp, ent);
}
ops->ulp_irq_restart(ulp->handle, ent);
- netdev_unlock(bp->dev);
kfree(ent);
}
}
diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c
index 1fe8ec37491b..e1e8bd2ec155 100644
--- a/drivers/net/ethernet/cadence/macb_main.c
+++ b/drivers/net/ethernet/cadence/macb_main.c
@@ -997,22 +997,15 @@ static void macb_update_stats(struct macb *bp)
static int macb_halt_tx(struct macb *bp)
{
- unsigned long halt_time, timeout;
- u32 status;
+ u32 status;
macb_writel(bp, NCR, macb_readl(bp, NCR) | MACB_BIT(THALT));
- timeout = jiffies + usecs_to_jiffies(MACB_HALT_TIMEOUT);
- do {
- halt_time = jiffies;
- status = macb_readl(bp, TSR);
- if (!(status & MACB_BIT(TGO)))
- return 0;
-
- udelay(250);
- } while (time_before(halt_time, timeout));
-
- return -ETIMEDOUT;
+ /* Poll TSR until TGO is cleared or timeout. */
+ return read_poll_timeout_atomic(macb_readl, status,
+ !(status & MACB_BIT(TGO)),
+ 250, MACB_HALT_TIMEOUT, false,
+ bp, TSR);
}
static void macb_tx_unmap(struct macb *bp, struct macb_tx_skb *tx_skb, int budget)
diff --git a/drivers/net/ethernet/engleder/tsnep_main.c b/drivers/net/ethernet/engleder/tsnep_main.c
index 625245b0845c..eba73246f986 100644
--- a/drivers/net/ethernet/engleder/tsnep_main.c
+++ b/drivers/net/ethernet/engleder/tsnep_main.c
@@ -67,6 +67,8 @@
#define TSNEP_TX_TYPE_XDP_NDO_MAP_PAGE (TSNEP_TX_TYPE_XDP_NDO | TSNEP_TX_TYPE_MAP_PAGE)
#define TSNEP_TX_TYPE_XDP (TSNEP_TX_TYPE_XDP_TX | TSNEP_TX_TYPE_XDP_NDO)
#define TSNEP_TX_TYPE_XSK BIT(12)
+#define TSNEP_TX_TYPE_TSTAMP BIT(13)
+#define TSNEP_TX_TYPE_SKB_TSTAMP (TSNEP_TX_TYPE_SKB | TSNEP_TX_TYPE_TSTAMP)
#define TSNEP_XDP_TX BIT(0)
#define TSNEP_XDP_REDIRECT BIT(1)
@@ -386,8 +388,7 @@ static void tsnep_tx_activate(struct tsnep_tx *tx, int index, int length,
if (entry->skb) {
entry->properties = length & TSNEP_DESC_LENGTH_MASK;
entry->properties |= TSNEP_DESC_INTERRUPT_FLAG;
- if ((entry->type & TSNEP_TX_TYPE_SKB) &&
- (skb_shinfo(entry->skb)->tx_flags & SKBTX_IN_PROGRESS))
+ if ((entry->type & TSNEP_TX_TYPE_SKB_TSTAMP) == TSNEP_TX_TYPE_SKB_TSTAMP)
entry->properties |= TSNEP_DESC_EXTENDED_WRITEBACK_FLAG;
/* toggle user flag to prevent false acknowledge
@@ -479,7 +480,8 @@ static int tsnep_tx_map_frag(skb_frag_t *frag, struct tsnep_tx_entry *entry,
return mapped;
}
-static int tsnep_tx_map(struct sk_buff *skb, struct tsnep_tx *tx, int count)
+static int tsnep_tx_map(struct sk_buff *skb, struct tsnep_tx *tx, int count,
+ bool do_tstamp)
{
struct device *dmadev = tx->adapter->dmadev;
struct tsnep_tx_entry *entry;
@@ -505,6 +507,9 @@ static int tsnep_tx_map(struct sk_buff *skb, struct tsnep_tx *tx, int count)
entry->type = TSNEP_TX_TYPE_SKB_INLINE;
mapped = 0;
}
+
+ if (do_tstamp)
+ entry->type |= TSNEP_TX_TYPE_TSTAMP;
} else {
skb_frag_t *frag = &skb_shinfo(skb)->frags[i - 1];
@@ -558,11 +563,12 @@ static int tsnep_tx_unmap(struct tsnep_tx *tx, int index, int count)
static netdev_tx_t tsnep_xmit_frame_ring(struct sk_buff *skb,
struct tsnep_tx *tx)
{
- int count = 1;
struct tsnep_tx_entry *entry;
+ bool do_tstamp = false;
+ int count = 1;
int length;
- int i;
int retval;
+ int i;
if (skb_shinfo(skb)->nr_frags > 0)
count += skb_shinfo(skb)->nr_frags;
@@ -579,7 +585,13 @@ static netdev_tx_t tsnep_xmit_frame_ring(struct sk_buff *skb,
entry = &tx->entry[tx->write];
entry->skb = skb;
- retval = tsnep_tx_map(skb, tx, count);
+ if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP) &&
+ tx->adapter->hwtstamp_config.tx_type == HWTSTAMP_TX_ON) {
+ skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS;
+ do_tstamp = true;
+ }
+
+ retval = tsnep_tx_map(skb, tx, count, do_tstamp);
if (retval < 0) {
tsnep_tx_unmap(tx, tx->write, count);
dev_kfree_skb_any(entry->skb);
@@ -591,9 +603,6 @@ static netdev_tx_t tsnep_xmit_frame_ring(struct sk_buff *skb,
}
length = retval;
- if (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)
- skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS;
-
for (i = 0; i < count; i++)
tsnep_tx_activate(tx, (tx->write + i) & TSNEP_RING_MASK, length,
i == count - 1);
@@ -844,8 +853,7 @@ static bool tsnep_tx_poll(struct tsnep_tx *tx, int napi_budget)
length = tsnep_tx_unmap(tx, tx->read, count);
- if ((entry->type & TSNEP_TX_TYPE_SKB) &&
- (skb_shinfo(entry->skb)->tx_flags & SKBTX_IN_PROGRESS) &&
+ if (((entry->type & TSNEP_TX_TYPE_SKB_TSTAMP) == TSNEP_TX_TYPE_SKB_TSTAMP) &&
(__le32_to_cpu(entry->desc_wb->properties) &
TSNEP_DESC_EXTENDED_WRITEBACK_FLAG)) {
struct skb_shared_hwtstamps hwtstamps;
diff --git a/drivers/net/ethernet/hisilicon/hibmcge/hbg_err.c b/drivers/net/ethernet/hisilicon/hibmcge/hbg_err.c
index a0bcfb5a713d..ff3295b60a69 100644
--- a/drivers/net/ethernet/hisilicon/hibmcge/hbg_err.c
+++ b/drivers/net/ethernet/hisilicon/hibmcge/hbg_err.c
@@ -61,6 +61,8 @@ static int hbg_reset_prepare(struct hbg_priv *priv, enum hbg_reset_type type)
return -EBUSY;
}
+ netif_device_detach(priv->netdev);
+
priv->reset_type = type;
set_bit(HBG_NIC_STATE_RESETTING, &priv->state);
clear_bit(HBG_NIC_STATE_RESET_FAIL, &priv->state);
@@ -91,6 +93,8 @@ static int hbg_reset_done(struct hbg_priv *priv, enum hbg_reset_type type)
return ret;
}
+ netif_device_attach(priv->netdev);
+
dev_info(&priv->pdev->dev, "reset done\n");
return ret;
}
@@ -117,16 +121,13 @@ void hbg_err_reset(struct hbg_priv *priv)
if (running)
dev_close(priv->netdev);
- hbg_reset(priv);
-
- /* in hbg_pci_err_detected(), we will detach first,
- * so we need to attach before open
- */
- if (!netif_device_present(priv->netdev))
- netif_device_attach(priv->netdev);
+ if (hbg_reset(priv))
+ goto err_unlock;
if (running)
dev_open(priv->netdev, NULL);
+
+err_unlock:
rtnl_unlock();
}
@@ -160,7 +161,6 @@ static pci_ers_result_t hbg_pci_err_slot_reset(struct pci_dev *pdev)
pci_save_state(pdev);
hbg_err_reset(priv);
- netif_device_attach(netdev);
return PCI_ERS_RESULT_RECOVERED;
}
diff --git a/drivers/net/ethernet/hisilicon/hibmcge/hbg_ethtool.c b/drivers/net/ethernet/hisilicon/hibmcge/hbg_ethtool.c
index 8f1107b85fbb..55520053270a 100644
--- a/drivers/net/ethernet/hisilicon/hibmcge/hbg_ethtool.c
+++ b/drivers/net/ethernet/hisilicon/hibmcge/hbg_ethtool.c
@@ -317,6 +317,9 @@ static void hbg_update_stats_by_info(struct hbg_priv *priv,
const struct hbg_ethtool_stats *stats;
u32 i;
+ if (test_bit(HBG_NIC_STATE_RESETTING, &priv->state))
+ return;
+
for (i = 0; i < info_len; i++) {
stats = &info[i];
if (!stats->reg)
diff --git a/drivers/net/ethernet/intel/ice/ice_lag.c b/drivers/net/ethernet/intel/ice/ice_lag.c
index 22371011c249..2410aee59fb2 100644
--- a/drivers/net/ethernet/intel/ice/ice_lag.c
+++ b/drivers/net/ethernet/intel/ice/ice_lag.c
@@ -1321,12 +1321,18 @@ static void ice_lag_changeupper_event(struct ice_lag *lag, void *ptr)
*/
if (!primary_lag) {
lag->primary = true;
+ if (!ice_is_switchdev_running(lag->pf))
+ return;
+
/* Configure primary's SWID to be shared */
ice_lag_primary_swid(lag, true);
primary_lag = lag;
} else {
u16 swid;
+ if (!ice_is_switchdev_running(primary_lag->pf))
+ return;
+
swid = primary_lag->pf->hw.port_info->sw_id;
ice_lag_set_swid(swid, lag, true);
ice_lag_add_prune_list(primary_lag, lag->pf);
diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl.c b/drivers/net/ethernet/intel/ice/ice_virtchnl.c
index 7c3006eb68dd..6446d0fcc052 100644
--- a/drivers/net/ethernet/intel/ice/ice_virtchnl.c
+++ b/drivers/net/ethernet/intel/ice/ice_virtchnl.c
@@ -4275,7 +4275,6 @@ static int ice_vc_repr_add_mac(struct ice_vf *vf, u8 *msg)
}
ice_vfhw_mac_add(vf, &al->list[i]);
- vf->num_mac++;
break;
}
diff --git a/drivers/net/ethernet/intel/idpf/idpf.h b/drivers/net/ethernet/intel/idpf/idpf.h
index aef0e9775a33..70dbf80f3bb7 100644
--- a/drivers/net/ethernet/intel/idpf/idpf.h
+++ b/drivers/net/ethernet/intel/idpf/idpf.h
@@ -143,6 +143,7 @@ enum idpf_vport_state {
* @vport_id: Vport identifier
* @link_speed_mbps: Link speed in mbps
* @vport_idx: Relative vport index
+ * @max_tx_hdr_size: Max header length hardware can support
* @state: See enum idpf_vport_state
* @netstats: Packet and byte stats
* @stats_lock: Lock to protect stats update
@@ -153,6 +154,7 @@ struct idpf_netdev_priv {
u32 vport_id;
u32 link_speed_mbps;
u16 vport_idx;
+ u16 max_tx_hdr_size;
enum idpf_vport_state state;
struct rtnl_link_stats64 netstats;
spinlock_t stats_lock;
diff --git a/drivers/net/ethernet/intel/idpf/idpf_lib.c b/drivers/net/ethernet/intel/idpf/idpf_lib.c
index 82f09b4030bc..3a033ce19cda 100644
--- a/drivers/net/ethernet/intel/idpf/idpf_lib.c
+++ b/drivers/net/ethernet/intel/idpf/idpf_lib.c
@@ -723,6 +723,7 @@ static int idpf_cfg_netdev(struct idpf_vport *vport)
np->vport = vport;
np->vport_idx = vport->idx;
np->vport_id = vport->vport_id;
+ np->max_tx_hdr_size = idpf_get_max_tx_hdr_size(adapter);
vport->netdev = netdev;
return idpf_init_mac_addr(vport, netdev);
@@ -740,6 +741,7 @@ static int idpf_cfg_netdev(struct idpf_vport *vport)
np->adapter = adapter;
np->vport_idx = vport->idx;
np->vport_id = vport->vport_id;
+ np->max_tx_hdr_size = idpf_get_max_tx_hdr_size(adapter);
spin_lock_init(&np->stats_lock);
@@ -2203,8 +2205,8 @@ static netdev_features_t idpf_features_check(struct sk_buff *skb,
struct net_device *netdev,
netdev_features_t features)
{
- struct idpf_vport *vport = idpf_netdev_to_vport(netdev);
- struct idpf_adapter *adapter = vport->adapter;
+ struct idpf_netdev_priv *np = netdev_priv(netdev);
+ u16 max_tx_hdr_size = np->max_tx_hdr_size;
size_t len;
/* No point in doing any of this if neither checksum nor GSO are
@@ -2227,7 +2229,7 @@ static netdev_features_t idpf_features_check(struct sk_buff *skb,
goto unsupported;
len = skb_network_header_len(skb);
- if (unlikely(len > idpf_get_max_tx_hdr_size(adapter)))
+ if (unlikely(len > max_tx_hdr_size))
goto unsupported;
if (!skb->encapsulation)
@@ -2240,7 +2242,7 @@ static netdev_features_t idpf_features_check(struct sk_buff *skb,
/* IPLEN can support at most 127 dwords */
len = skb_inner_network_header_len(skb);
- if (unlikely(len > idpf_get_max_tx_hdr_size(adapter)))
+ if (unlikely(len > max_tx_hdr_size))
goto unsupported;
/* No need to validate L4LEN as TCP is the only protocol with a
diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_txrx.c
index bdf52cef3891..2d5f5c9f91ce 100644
--- a/drivers/net/ethernet/intel/idpf/idpf_txrx.c
+++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.c
@@ -4025,6 +4025,14 @@ static int idpf_vport_splitq_napi_poll(struct napi_struct *napi, int budget)
return budget;
}
+ /* Switch to poll mode in the tear-down path after sending disable
+ * queues virtchnl message, as the interrupts will be disabled after
+ * that.
+ */
+ if (unlikely(q_vector->num_txq && idpf_queue_has(POLL_MODE,
+ q_vector->tx[0])))
+ return budget;
+
work_done = min_t(int, work_done, budget - 1);
/* Exit the polling mode, but don't re-enable interrupts if stack might
@@ -4035,15 +4043,7 @@ static int idpf_vport_splitq_napi_poll(struct napi_struct *napi, int budget)
else
idpf_vport_intr_set_wb_on_itr(q_vector);
- /* Switch to poll mode in the tear-down path after sending disable
- * queues virtchnl message, as the interrupts will be disabled after
- * that
- */
- if (unlikely(q_vector->num_txq && idpf_queue_has(POLL_MODE,
- q_vector->tx[0])))
- return budget;
- else
- return work_done;
+ return work_done;
}
/**
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/cgx.c b/drivers/net/ethernet/marvell/octeontx2/af/cgx.c
index 0b27a695008b..971993586fb4 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/cgx.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/cgx.c
@@ -717,6 +717,11 @@ int cgx_get_rx_stats(void *cgxd, int lmac_id, int idx, u64 *rx_stat)
if (!is_lmac_valid(cgx, lmac_id))
return -ENODEV;
+
+ /* pass lmac as 0 for CGX_CMR_RX_STAT9-12 */
+ if (idx >= CGX_RX_STAT_GLOBAL_INDEX)
+ lmac_id = 0;
+
*rx_stat = cgx_read(cgx, lmac_id, CGXX_CMRX_RX_STAT0 + (idx * 8));
return 0;
}
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cn10k.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cn10k.c
index 7fa98aeb3663..4a3370a40dd8 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cn10k.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cn10k.c
@@ -13,19 +13,26 @@
/* RVU LMTST */
#define LMT_TBL_OP_READ 0
#define LMT_TBL_OP_WRITE 1
-#define LMT_MAP_TABLE_SIZE (128 * 1024)
#define LMT_MAPTBL_ENTRY_SIZE 16
+#define LMT_MAX_VFS 256
+
+#define LMT_MAP_ENTRY_ENA BIT_ULL(20)
+#define LMT_MAP_ENTRY_LINES GENMASK_ULL(18, 16)
/* Function to perform operations (read/write) on lmtst map table */
static int lmtst_map_table_ops(struct rvu *rvu, u32 index, u64 *val,
int lmt_tbl_op)
{
void __iomem *lmt_map_base;
- u64 tbl_base;
+ u64 tbl_base, cfg;
+ int pfs, vfs;
tbl_base = rvu_read64(rvu, BLKADDR_APR, APR_AF_LMT_MAP_BASE);
+ cfg = rvu_read64(rvu, BLKADDR_APR, APR_AF_LMT_CFG);
+ vfs = 1 << (cfg & 0xF);
+ pfs = 1 << ((cfg >> 4) & 0x7);
- lmt_map_base = ioremap_wc(tbl_base, LMT_MAP_TABLE_SIZE);
+ lmt_map_base = ioremap_wc(tbl_base, pfs * vfs * LMT_MAPTBL_ENTRY_SIZE);
if (!lmt_map_base) {
dev_err(rvu->dev, "Failed to setup lmt map table mapping!!\n");
return -ENOMEM;
@@ -35,6 +42,13 @@ static int lmtst_map_table_ops(struct rvu *rvu, u32 index, u64 *val,
*val = readq(lmt_map_base + index);
} else {
writeq((*val), (lmt_map_base + index));
+
+ cfg = FIELD_PREP(LMT_MAP_ENTRY_ENA, 0x1);
+ /* 2048 LMTLINES */
+ cfg |= FIELD_PREP(LMT_MAP_ENTRY_LINES, 0x6);
+
+ writeq(cfg, (lmt_map_base + (index + 8)));
+
/* Flushing the AP interceptor cache to make APR_LMT_MAP_ENTRY_S
* changes effective. Write 1 for flush and read is being used as a
* barrier and sets up a data dependency. Write to 0 after a write
@@ -52,7 +66,7 @@ static int lmtst_map_table_ops(struct rvu *rvu, u32 index, u64 *val,
#define LMT_MAP_TBL_W1_OFF 8
static u32 rvu_get_lmtst_tbl_index(struct rvu *rvu, u16 pcifunc)
{
- return ((rvu_get_pf(pcifunc) * rvu->hw->total_vfs) +
+ return ((rvu_get_pf(pcifunc) * LMT_MAX_VFS) +
(pcifunc & RVU_PFVF_FUNC_MASK)) * LMT_MAPTBL_ENTRY_SIZE;
}
@@ -69,7 +83,7 @@ static int rvu_get_lmtaddr(struct rvu *rvu, u16 pcifunc,
mutex_lock(&rvu->rsrc_lock);
rvu_write64(rvu, BLKADDR_RVUM, RVU_AF_SMMU_ADDR_REQ, iova);
- pf = rvu_get_pf(pcifunc) & 0x1F;
+ pf = rvu_get_pf(pcifunc) & RVU_PFVF_PF_MASK;
val = BIT_ULL(63) | BIT_ULL(14) | BIT_ULL(13) | pf << 8 |
((pcifunc & RVU_PFVF_FUNC_MASK) & 0xFF);
rvu_write64(rvu, BLKADDR_RVUM, RVU_AF_SMMU_TXN_REQ, val);
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c
index a1f9ec03c2ce..c827da626471 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c
@@ -553,6 +553,7 @@ static ssize_t rvu_dbg_lmtst_map_table_display(struct file *filp,
u64 lmt_addr, val, tbl_base;
int pf, vf, num_vfs, hw_vfs;
void __iomem *lmt_map_base;
+ int apr_pfs, apr_vfs;
int buf_size = 10240;
size_t off = 0;
int index = 0;
@@ -568,8 +569,12 @@ static ssize_t rvu_dbg_lmtst_map_table_display(struct file *filp,
return -ENOMEM;
tbl_base = rvu_read64(rvu, BLKADDR_APR, APR_AF_LMT_MAP_BASE);
+ val = rvu_read64(rvu, BLKADDR_APR, APR_AF_LMT_CFG);
+ apr_vfs = 1 << (val & 0xF);
+ apr_pfs = 1 << ((val >> 4) & 0x7);
- lmt_map_base = ioremap_wc(tbl_base, 128 * 1024);
+ lmt_map_base = ioremap_wc(tbl_base, apr_pfs * apr_vfs *
+ LMT_MAPTBL_ENTRY_SIZE);
if (!lmt_map_base) {
dev_err(rvu->dev, "Failed to setup lmt map table mapping!!\n");
kfree(buf);
@@ -591,7 +596,7 @@ static ssize_t rvu_dbg_lmtst_map_table_display(struct file *filp,
off += scnprintf(&buf[off], buf_size - 1 - off, "PF%d \t\t\t",
pf);
- index = pf * rvu->hw->total_vfs * LMT_MAPTBL_ENTRY_SIZE;
+ index = pf * apr_vfs * LMT_MAPTBL_ENTRY_SIZE;
off += scnprintf(&buf[off], buf_size - 1 - off, " 0x%llx\t\t",
(tbl_base + index));
lmt_addr = readq(lmt_map_base + index);
@@ -604,7 +609,7 @@ static ssize_t rvu_dbg_lmtst_map_table_display(struct file *filp,
/* Reading num of VFs per PF */
rvu_get_pf_numvfs(rvu, pf, &num_vfs, &hw_vfs);
for (vf = 0; vf < num_vfs; vf++) {
- index = (pf * rvu->hw->total_vfs * 16) +
+ index = (pf * apr_vfs * LMT_MAPTBL_ENTRY_SIZE) +
((vf + 1) * LMT_MAPTBL_ENTRY_SIZE);
off += scnprintf(&buf[off], buf_size - 1 - off,
"PF%d:VF%d \t\t", pf, vf);
diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_macsec.c b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_macsec.c
index f3b9daffaec3..4c7e0f345cb5 100644
--- a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_macsec.c
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_macsec.c
@@ -531,7 +531,8 @@ static int cn10k_mcs_write_tx_secy(struct otx2_nic *pfvf,
if (sw_tx_sc->encrypt)
sectag_tci |= (MCS_TCI_E | MCS_TCI_C);
- policy = FIELD_PREP(MCS_TX_SECY_PLCY_MTU, secy->netdev->mtu);
+ policy = FIELD_PREP(MCS_TX_SECY_PLCY_MTU,
+ pfvf->netdev->mtu + OTX2_ETH_HLEN);
/* Write SecTag excluding AN bits(1..0) */
policy |= FIELD_PREP(MCS_TX_SECY_PLCY_ST_TCI, sectag_tci >> 2);
policy |= FIELD_PREP(MCS_TX_SECY_PLCY_ST_OFFSET, tag_offset);
diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h
index 1e88422825be..d6b4b74e4002 100644
--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h
@@ -356,6 +356,7 @@ struct otx2_flow_config {
struct list_head flow_list_tc;
u8 ucast_flt_cnt;
bool ntuple;
+ u16 ntuple_cnt;
};
struct dev_hw_ops {
diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_devlink.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_devlink.c
index 33ec9a7f7c03..e13ae5484c19 100644
--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_devlink.c
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_devlink.c
@@ -41,6 +41,7 @@ static int otx2_dl_mcam_count_set(struct devlink *devlink, u32 id,
if (!pfvf->flow_cfg)
return 0;
+ pfvf->flow_cfg->ntuple_cnt = ctx->val.vu16;
otx2_alloc_mcam_entries(pfvf, ctx->val.vu16);
return 0;
diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c
index 010385b29988..45b8c9230184 100644
--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c
@@ -315,7 +315,7 @@ static void otx2_get_pauseparam(struct net_device *netdev,
struct otx2_nic *pfvf = netdev_priv(netdev);
struct cgx_pause_frm_cfg *req, *rsp;
- if (is_otx2_lbkvf(pfvf->pdev))
+ if (is_otx2_lbkvf(pfvf->pdev) || is_otx2_sdp_rep(pfvf->pdev))
return;
mutex_lock(&pfvf->mbox.lock);
@@ -347,7 +347,7 @@ static int otx2_set_pauseparam(struct net_device *netdev,
if (pause->autoneg)
return -EOPNOTSUPP;
- if (is_otx2_lbkvf(pfvf->pdev))
+ if (is_otx2_lbkvf(pfvf->pdev) || is_otx2_sdp_rep(pfvf->pdev))
return -EOPNOTSUPP;
if (pause->rx_pause)
@@ -941,8 +941,8 @@ static u32 otx2_get_link(struct net_device *netdev)
{
struct otx2_nic *pfvf = netdev_priv(netdev);
- /* LBK link is internal and always UP */
- if (is_otx2_lbkvf(pfvf->pdev))
+ /* LBK and SDP links are internal and always UP */
+ if (is_otx2_lbkvf(pfvf->pdev) || is_otx2_sdp_rep(pfvf->pdev))
return 1;
return pfvf->linfo.link_up;
}
@@ -1413,7 +1413,7 @@ static int otx2vf_get_link_ksettings(struct net_device *netdev,
{
struct otx2_nic *pfvf = netdev_priv(netdev);
- if (is_otx2_lbkvf(pfvf->pdev)) {
+ if (is_otx2_lbkvf(pfvf->pdev) || is_otx2_sdp_rep(pfvf->pdev)) {
cmd->base.duplex = DUPLEX_FULL;
cmd->base.speed = SPEED_100000;
} else {
diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_flows.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_flows.c
index 47bfd1fb37d4..64c6d9162ef6 100644
--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_flows.c
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_flows.c
@@ -247,7 +247,7 @@ int otx2_mcam_entry_init(struct otx2_nic *pfvf)
mutex_unlock(&pfvf->mbox.lock);
/* Allocate entries for Ntuple filters */
- count = otx2_alloc_mcam_entries(pfvf, OTX2_DEFAULT_FLOWCOUNT);
+ count = otx2_alloc_mcam_entries(pfvf, flow_cfg->ntuple_cnt);
if (count <= 0) {
otx2_clear_ntuple_flow_info(pfvf, flow_cfg);
return 0;
@@ -307,6 +307,7 @@ int otx2_mcam_flow_init(struct otx2_nic *pf)
INIT_LIST_HEAD(&pf->flow_cfg->flow_list_tc);
pf->flow_cfg->ucast_flt_cnt = OTX2_DEFAULT_UNICAST_FLOWS;
+ pf->flow_cfg->ntuple_cnt = OTX2_DEFAULT_FLOWCOUNT;
/* Allocate bare minimum number of MCAM entries needed for
* unicast and ntuple filters.
diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c
index 7ef3ba477d49..9b28be4c4a5d 100644
--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c
@@ -729,9 +729,12 @@ static int otx2vf_probe(struct pci_dev *pdev, const struct pci_device_id *id)
}
#ifdef CONFIG_DCB
- err = otx2_dcbnl_set_ops(netdev);
- if (err)
- goto err_free_zc_bmap;
+ /* Priority flow control is not supported for LBK and SDP vf(s) */
+ if (!(is_otx2_lbkvf(vf->pdev) || is_otx2_sdp_rep(vf->pdev))) {
+ err = otx2_dcbnl_set_ops(netdev);
+ if (err)
+ goto err_free_zc_bmap;
+ }
#endif
otx2_qos_init(vf, qos_txqs);
diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
index 22a532695fb0..6c92072b4c28 100644
--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
@@ -4748,7 +4748,7 @@ static int mtk_add_mac(struct mtk_eth *eth, struct device_node *np)
}
if (mtk_is_netsys_v3_or_greater(mac->hw) &&
- MTK_HAS_CAPS(mac->hw->soc->caps, MTK_ESW_BIT) &&
+ MTK_HAS_CAPS(mac->hw->soc->caps, MTK_ESW) &&
id == MTK_GMAC1_ID) {
mac->phylink_config.mac_capabilities = MAC_ASYM_PAUSE |
MAC_SYM_PAUSE |
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 3506024c2453..9bd166f489e7 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -4349,6 +4349,10 @@ static netdev_features_t mlx5e_fix_uplink_rep_features(struct net_device *netdev
if (netdev->features & NETIF_F_HW_VLAN_CTAG_FILTER)
netdev_warn(netdev, "Disabling HW_VLAN CTAG FILTERING, not supported in switchdev mode\n");
+ features &= ~NETIF_F_HW_MACSEC;
+ if (netdev->features & NETIF_F_HW_MACSEC)
+ netdev_warn(netdev, "Disabling HW MACsec offload, not supported in switchdev mode\n");
+
return features;
}
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
index 464821dd492d..a2033837182e 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
@@ -3014,6 +3014,9 @@ static int mlxsw_sp_neigh_rif_made_sync(struct mlxsw_sp *mlxsw_sp,
.rif = rif,
};
+ if (!mlxsw_sp_dev_lower_is_port(mlxsw_sp_rif_dev(rif)))
+ return 0;
+
neigh_for_each(&arp_tbl, mlxsw_sp_neigh_rif_made_sync_each, &rms);
if (rms.err)
goto err_arp;
diff --git a/drivers/net/ethernet/microchip/lan743x_main.c b/drivers/net/ethernet/microchip/lan743x_main.c
index e2d6bfb5d693..a70b88037a20 100644
--- a/drivers/net/ethernet/microchip/lan743x_main.c
+++ b/drivers/net/ethernet/microchip/lan743x_main.c
@@ -3495,6 +3495,7 @@ static int lan743x_hardware_init(struct lan743x_adapter *adapter,
struct pci_dev *pdev)
{
struct lan743x_tx *tx;
+ u32 sgmii_ctl;
int index;
int ret;
@@ -3507,6 +3508,15 @@ static int lan743x_hardware_init(struct lan743x_adapter *adapter,
spin_lock_init(&adapter->eth_syslock_spinlock);
mutex_init(&adapter->sgmii_rw_lock);
pci11x1x_set_rfe_rd_fifo_threshold(adapter);
+ sgmii_ctl = lan743x_csr_read(adapter, SGMII_CTL);
+ if (adapter->is_sgmii_en) {
+ sgmii_ctl |= SGMII_CTL_SGMII_ENABLE_;
+ sgmii_ctl &= ~SGMII_CTL_SGMII_POWER_DN_;
+ } else {
+ sgmii_ctl &= ~SGMII_CTL_SGMII_ENABLE_;
+ sgmii_ctl |= SGMII_CTL_SGMII_POWER_DN_;
+ }
+ lan743x_csr_write(adapter, SGMII_CTL, sgmii_ctl);
} else {
adapter->max_tx_channels = LAN743X_MAX_TX_CHANNELS;
adapter->used_tx_channels = LAN743X_USED_TX_CHANNELS;
@@ -3558,7 +3568,6 @@ static int lan743x_hardware_init(struct lan743x_adapter *adapter,
static int lan743x_mdiobus_init(struct lan743x_adapter *adapter)
{
- u32 sgmii_ctl;
int ret;
adapter->mdiobus = devm_mdiobus_alloc(&adapter->pdev->dev);
@@ -3570,10 +3579,6 @@ static int lan743x_mdiobus_init(struct lan743x_adapter *adapter)
adapter->mdiobus->priv = (void *)adapter;
if (adapter->is_pci11x1x) {
if (adapter->is_sgmii_en) {
- sgmii_ctl = lan743x_csr_read(adapter, SGMII_CTL);
- sgmii_ctl |= SGMII_CTL_SGMII_ENABLE_;
- sgmii_ctl &= ~SGMII_CTL_SGMII_POWER_DN_;
- lan743x_csr_write(adapter, SGMII_CTL, sgmii_ctl);
netif_dbg(adapter, drv, adapter->netdev,
"SGMII operation\n");
adapter->mdiobus->read = lan743x_mdiobus_read_c22;
@@ -3584,10 +3589,6 @@ static int lan743x_mdiobus_init(struct lan743x_adapter *adapter)
netif_dbg(adapter, drv, adapter->netdev,
"lan743x-mdiobus-c45\n");
} else {
- sgmii_ctl = lan743x_csr_read(adapter, SGMII_CTL);
- sgmii_ctl &= ~SGMII_CTL_SGMII_ENABLE_;
- sgmii_ctl |= SGMII_CTL_SGMII_POWER_DN_;
- lan743x_csr_write(adapter, SGMII_CTL, sgmii_ctl);
netif_dbg(adapter, drv, adapter->netdev,
"RGMII operation\n");
// Only C22 support when RGMII I/F
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index 99df00c30b8c..b5d744d2586f 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -203,7 +203,7 @@ static struct pci_driver qede_pci_driver = {
};
static struct qed_eth_cb_ops qede_ll_ops = {
- {
+ .common = {
#ifdef CONFIG_RFS_ACCEL
.arfs_filter_op = qede_arfs_filter_op,
#endif
diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_common.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_common.c
index 28d24d59efb8..d57b976b9040 100644
--- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_common.c
+++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_common.c
@@ -1484,8 +1484,11 @@ static int qlcnic_sriov_channel_cfg_cmd(struct qlcnic_adapter *adapter, u8 cmd_o
}
cmd_op = (cmd.rsp.arg[0] & 0xff);
- if (cmd.rsp.arg[0] >> 25 == 2)
- return 2;
+ if (cmd.rsp.arg[0] >> 25 == 2) {
+ ret = 2;
+ goto out;
+ }
+
if (cmd_op == QLCNIC_BC_CMD_CHANNEL_INIT)
set_bit(QLC_BC_VF_STATE, &vf->state);
else
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c
index 85723a78793a..6c7e8655a7eb 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c
@@ -964,7 +964,7 @@ static int sun8i_dwmac_set_syscon(struct device *dev,
/* of_mdio_parse_addr returns a valid (0 ~ 31) PHY
* address. No need to mask it again.
*/
- reg |= 1 << H3_EPHY_ADDR_SHIFT;
+ reg |= ret << H3_EPHY_ADDR_SHIFT;
} else {
/* For SoCs without internal PHY the PHY selection bit should be
* set to 0 (external PHY).
diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c
index 1e6d2335293d..30665ffe78cf 100644
--- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c
+++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c
@@ -2685,7 +2685,7 @@ static int am65_cpsw_nuss_init_slave_ports(struct am65_cpsw_common *common)
port->slave.mac_addr);
if (!is_valid_ether_addr(port->slave.mac_addr)) {
eth_random_addr(port->slave.mac_addr);
- dev_err(dev, "Use random MAC address\n");
+ dev_info(dev, "Use random MAC address\n");
}
}
diff --git a/drivers/net/ethernet/wangxun/libwx/wx_hw.c b/drivers/net/ethernet/wangxun/libwx/wx_hw.c
index aed45abafb1b..490d34233d38 100644
--- a/drivers/net/ethernet/wangxun/libwx/wx_hw.c
+++ b/drivers/net/ethernet/wangxun/libwx/wx_hw.c
@@ -434,14 +434,20 @@ static int wx_host_interface_command_r(struct wx *wx, u32 *buffer,
wr32m(wx, WX_SW2FW_MBOX_CMD, WX_SW2FW_MBOX_CMD_VLD, WX_SW2FW_MBOX_CMD_VLD);
/* polling reply from FW */
- err = read_poll_timeout(wx_poll_fw_reply, reply, reply, 1000, 50000,
- true, wx, buffer, send_cmd);
+ err = read_poll_timeout(wx_poll_fw_reply, reply, reply, 2000,
+ timeout * 1000, true, wx, buffer, send_cmd);
if (err) {
wx_err(wx, "Polling from FW messages timeout, cmd: 0x%x, index: %d\n",
send_cmd, wx->swfw_index);
goto rel_out;
}
+ if (hdr->cmd_or_resp.ret_status == 0x80) {
+ wx_err(wx, "Unknown FW command: 0x%x\n", send_cmd);
+ err = -EINVAL;
+ goto rel_out;
+ }
+
/* expect no reply from FW then return */
if (!return_data)
goto rel_out;
diff --git a/drivers/net/ethernet/wangxun/txgbe/txgbe_hw.c b/drivers/net/ethernet/wangxun/txgbe/txgbe_hw.c
index 4b9921b7bb11..a054b259d435 100644
--- a/drivers/net/ethernet/wangxun/txgbe/txgbe_hw.c
+++ b/drivers/net/ethernet/wangxun/txgbe/txgbe_hw.c
@@ -99,9 +99,15 @@ static int txgbe_calc_eeprom_checksum(struct wx *wx, u16 *checksum)
}
local_buffer = eeprom_ptrs;
- for (i = 0; i < TXGBE_EEPROM_LAST_WORD; i++)
+ for (i = 0; i < TXGBE_EEPROM_LAST_WORD; i++) {
+ if (wx->mac.type == wx_mac_aml) {
+ if (i >= TXGBE_EEPROM_I2C_SRART_PTR &&
+ i < TXGBE_EEPROM_I2C_END_PTR)
+ local_buffer[i] = 0xffff;
+ }
if (i != wx->eeprom.sw_region_offset + TXGBE_EEPROM_CHECKSUM)
*checksum += local_buffer[i];
+ }
kvfree(eeprom_ptrs);
diff --git a/drivers/net/ethernet/wangxun/txgbe/txgbe_type.h b/drivers/net/ethernet/wangxun/txgbe/txgbe_type.h
index 9c1c26234cad..f423012dec22 100644
--- a/drivers/net/ethernet/wangxun/txgbe/txgbe_type.h
+++ b/drivers/net/ethernet/wangxun/txgbe/txgbe_type.h
@@ -158,6 +158,8 @@
#define TXGBE_EEPROM_VERSION_L 0x1D
#define TXGBE_EEPROM_VERSION_H 0x1E
#define TXGBE_ISCSI_BOOT_CONFIG 0x07
+#define TXGBE_EEPROM_I2C_SRART_PTR 0x580
+#define TXGBE_EEPROM_I2C_END_PTR 0x800
#define TXGBE_MAX_MSIX_VECTORS 64
#define TXGBE_MAX_FDIR_INDICES 63
diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h
index 70f7cb383228..cb6f5482d203 100644
--- a/drivers/net/hyperv/hyperv_net.h
+++ b/drivers/net/hyperv/hyperv_net.h
@@ -158,7 +158,6 @@ struct hv_netvsc_packet {
u8 cp_partial; /* partial copy into send buffer */
u8 rmsg_size; /* RNDIS header and PPI size */
- u8 rmsg_pgcnt; /* page count of RNDIS header and PPI */
u8 page_buf_cnt;
u16 q_idx;
@@ -893,6 +892,18 @@ struct nvsp_message {
sizeof(struct nvsp_message))
#define NETVSC_MIN_IN_MSG_SIZE sizeof(struct vmpacket_descriptor)
+/* Maximum # of contiguous data ranges that can make up a trasmitted packet.
+ * Typically it's the max SKB fragments plus 2 for the rndis packet and the
+ * linear portion of the SKB. But if MAX_SKB_FRAGS is large, the value may
+ * need to be limited to MAX_PAGE_BUFFER_COUNT, which is the max # of entries
+ * in a GPA direct packet sent to netvsp over VMBus.
+ */
+#if MAX_SKB_FRAGS + 2 < MAX_PAGE_BUFFER_COUNT
+#define MAX_DATA_RANGES (MAX_SKB_FRAGS + 2)
+#else
+#define MAX_DATA_RANGES MAX_PAGE_BUFFER_COUNT
+#endif
+
/* Estimated requestor size:
* out_ring_size/min_out_msg_size + in_ring_size/min_in_msg_size
*/
diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index d6f5b9ea3109..720104661d7f 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -953,8 +953,7 @@ static void netvsc_copy_to_send_buf(struct netvsc_device *net_device,
+ pend_size;
int i;
u32 padding = 0;
- u32 page_count = packet->cp_partial ? packet->rmsg_pgcnt :
- packet->page_buf_cnt;
+ u32 page_count = packet->cp_partial ? 1 : packet->page_buf_cnt;
u32 remain;
/* Add padding */
@@ -1055,6 +1054,42 @@ static int netvsc_dma_map(struct hv_device *hv_dev,
return 0;
}
+/* Build an "array" of mpb entries describing the data to be transferred
+ * over VMBus. After the desc header fields, each "array" entry is variable
+ * size, and each entry starts after the end of the previous entry. The
+ * "offset" and "len" fields for each entry imply the size of the entry.
+ *
+ * The pfns are in HV_HYP_PAGE_SIZE, because all communication with Hyper-V
+ * uses that granularity, even if the system page size of the guest is larger.
+ * Each entry in the input "pb" array must describe a contiguous range of
+ * guest physical memory so that the pfns are sequential if the range crosses
+ * a page boundary. The offset field must be < HV_HYP_PAGE_SIZE.
+ */
+static inline void netvsc_build_mpb_array(struct hv_page_buffer *pb,
+ u32 page_buffer_count,
+ struct vmbus_packet_mpb_array *desc,
+ u32 *desc_size)
+{
+ struct hv_mpb_array *mpb_entry = &desc->range;
+ int i, j;
+
+ for (i = 0; i < page_buffer_count; i++) {
+ u32 offset = pb[i].offset;
+ u32 len = pb[i].len;
+
+ mpb_entry->offset = offset;
+ mpb_entry->len = len;
+
+ for (j = 0; j < HVPFN_UP(offset + len); j++)
+ mpb_entry->pfn_array[j] = pb[i].pfn + j;
+
+ mpb_entry = (struct hv_mpb_array *)&mpb_entry->pfn_array[j];
+ }
+
+ desc->rangecount = page_buffer_count;
+ *desc_size = (char *)mpb_entry - (char *)desc;
+}
+
static inline int netvsc_send_pkt(
struct hv_device *device,
struct hv_netvsc_packet *packet,
@@ -1097,8 +1132,11 @@ static inline int netvsc_send_pkt(
packet->dma_range = NULL;
if (packet->page_buf_cnt) {
+ struct vmbus_channel_packet_page_buffer desc;
+ u32 desc_size;
+
if (packet->cp_partial)
- pb += packet->rmsg_pgcnt;
+ pb++;
ret = netvsc_dma_map(ndev_ctx->device_ctx, packet, pb);
if (ret) {
@@ -1106,11 +1144,12 @@ static inline int netvsc_send_pkt(
goto exit;
}
- ret = vmbus_sendpacket_pagebuffer(out_channel,
- pb, packet->page_buf_cnt,
- &nvmsg, sizeof(nvmsg),
- req_id);
-
+ netvsc_build_mpb_array(pb, packet->page_buf_cnt,
+ (struct vmbus_packet_mpb_array *)&desc,
+ &desc_size);
+ ret = vmbus_sendpacket_mpb_desc(out_channel,
+ (struct vmbus_packet_mpb_array *)&desc,
+ desc_size, &nvmsg, sizeof(nvmsg), req_id);
if (ret)
netvsc_dma_unmap(ndev_ctx->device_ctx, packet);
} else {
@@ -1259,7 +1298,7 @@ int netvsc_send(struct net_device *ndev,
packet->send_buf_index = section_index;
if (packet->cp_partial) {
- packet->page_buf_cnt -= packet->rmsg_pgcnt;
+ packet->page_buf_cnt--;
packet->total_data_buflen = msd_len + packet->rmsg_size;
} else {
packet->page_buf_cnt = 0;
diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index c51b318b8a72..d8b169ac0343 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -326,43 +326,10 @@ static u16 netvsc_select_queue(struct net_device *ndev, struct sk_buff *skb,
return txq;
}
-static u32 fill_pg_buf(unsigned long hvpfn, u32 offset, u32 len,
- struct hv_page_buffer *pb)
-{
- int j = 0;
-
- hvpfn += offset >> HV_HYP_PAGE_SHIFT;
- offset = offset & ~HV_HYP_PAGE_MASK;
-
- while (len > 0) {
- unsigned long bytes;
-
- bytes = HV_HYP_PAGE_SIZE - offset;
- if (bytes > len)
- bytes = len;
- pb[j].pfn = hvpfn;
- pb[j].offset = offset;
- pb[j].len = bytes;
-
- offset += bytes;
- len -= bytes;
-
- if (offset == HV_HYP_PAGE_SIZE && len) {
- hvpfn++;
- offset = 0;
- j++;
- }
- }
-
- return j + 1;
-}
-
static u32 init_page_array(void *hdr, u32 len, struct sk_buff *skb,
struct hv_netvsc_packet *packet,
struct hv_page_buffer *pb)
{
- u32 slots_used = 0;
- char *data = skb->data;
int frags = skb_shinfo(skb)->nr_frags;
int i;
@@ -371,28 +338,27 @@ static u32 init_page_array(void *hdr, u32 len, struct sk_buff *skb,
* 2. skb linear data
* 3. skb fragment data
*/
- slots_used += fill_pg_buf(virt_to_hvpfn(hdr),
- offset_in_hvpage(hdr),
- len,
- &pb[slots_used]);
+ pb[0].offset = offset_in_hvpage(hdr);
+ pb[0].len = len;
+ pb[0].pfn = virt_to_hvpfn(hdr);
packet->rmsg_size = len;
- packet->rmsg_pgcnt = slots_used;
- slots_used += fill_pg_buf(virt_to_hvpfn(data),
- offset_in_hvpage(data),
- skb_headlen(skb),
- &pb[slots_used]);
+ pb[1].offset = offset_in_hvpage(skb->data);
+ pb[1].len = skb_headlen(skb);
+ pb[1].pfn = virt_to_hvpfn(skb->data);
for (i = 0; i < frags; i++) {
skb_frag_t *frag = skb_shinfo(skb)->frags + i;
+ struct hv_page_buffer *cur_pb = &pb[i + 2];
+ u64 pfn = page_to_hvpfn(skb_frag_page(frag));
+ u32 offset = skb_frag_off(frag);
- slots_used += fill_pg_buf(page_to_hvpfn(skb_frag_page(frag)),
- skb_frag_off(frag),
- skb_frag_size(frag),
- &pb[slots_used]);
+ cur_pb->offset = offset_in_hvpage(offset);
+ cur_pb->len = skb_frag_size(frag);
+ cur_pb->pfn = pfn + (offset >> HV_HYP_PAGE_SHIFT);
}
- return slots_used;
+ return frags + 2;
}
static int count_skb_frag_slots(struct sk_buff *skb)
@@ -483,7 +449,7 @@ static int netvsc_xmit(struct sk_buff *skb, struct net_device *net, bool xdp_tx)
struct net_device *vf_netdev;
u32 rndis_msg_size;
u32 hash;
- struct hv_page_buffer pb[MAX_PAGE_BUFFER_COUNT];
+ struct hv_page_buffer pb[MAX_DATA_RANGES];
/* If VF is present and up then redirect packets to it.
* Skip the VF if it is marked down or has no carrier.
diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c
index 82747dfacd70..9e73959e61ee 100644
--- a/drivers/net/hyperv/rndis_filter.c
+++ b/drivers/net/hyperv/rndis_filter.c
@@ -225,8 +225,7 @@ static int rndis_filter_send_request(struct rndis_device *dev,
struct rndis_request *req)
{
struct hv_netvsc_packet *packet;
- struct hv_page_buffer page_buf[2];
- struct hv_page_buffer *pb = page_buf;
+ struct hv_page_buffer pb;
int ret;
/* Setup the packet to send it */
@@ -235,27 +234,14 @@ static int rndis_filter_send_request(struct rndis_device *dev,
packet->total_data_buflen = req->request_msg.msg_len;
packet->page_buf_cnt = 1;
- pb[0].pfn = virt_to_phys(&req->request_msg) >>
- HV_HYP_PAGE_SHIFT;
- pb[0].len = req->request_msg.msg_len;
- pb[0].offset = offset_in_hvpage(&req->request_msg);
-
- /* Add one page_buf when request_msg crossing page boundary */
- if (pb[0].offset + pb[0].len > HV_HYP_PAGE_SIZE) {
- packet->page_buf_cnt++;
- pb[0].len = HV_HYP_PAGE_SIZE -
- pb[0].offset;
- pb[1].pfn = virt_to_phys((void *)&req->request_msg
- + pb[0].len) >> HV_HYP_PAGE_SHIFT;
- pb[1].offset = 0;
- pb[1].len = req->request_msg.msg_len -
- pb[0].len;
- }
+ pb.pfn = virt_to_phys(&req->request_msg) >> HV_HYP_PAGE_SHIFT;
+ pb.len = req->request_msg.msg_len;
+ pb.offset = offset_in_hvpage(&req->request_msg);
trace_rndis_send(dev->ndev, 0, &req->request_msg);
rcu_read_lock_bh();
- ret = netvsc_send(dev->ndev, packet, NULL, pb, NULL, false);
+ ret = netvsc_send(dev->ndev, packet, NULL, &pb, NULL, false);
rcu_read_unlock_bh();
return ret;
diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c
index 24882d30f685..e2c6569d8c45 100644
--- a/drivers/net/phy/micrel.c
+++ b/drivers/net/phy/micrel.c
@@ -2027,12 +2027,6 @@ static int ksz9477_config_init(struct phy_device *phydev)
return err;
}
- /* According to KSZ9477 Errata DS80000754C (Module 4) all EEE modes
- * in this switch shall be regarded as broken.
- */
- if (phydev->dev_flags & MICREL_NO_EEE)
- phy_disable_eee(phydev);
-
return kszphy_config_init(phydev);
}
@@ -5705,7 +5699,6 @@ static struct phy_driver ksphy_driver[] = {
.handle_interrupt = kszphy_handle_interrupt,
.suspend = genphy_suspend,
.resume = ksz9477_resume,
- .get_features = ksz9477_get_features,
} };
module_phy_driver(ksphy_driver);
diff --git a/drivers/net/team/team_core.c b/drivers/net/team/team_core.c
index d8fc0c79745d..b75ceb90359f 100644
--- a/drivers/net/team/team_core.c
+++ b/drivers/net/team/team_core.c
@@ -1778,8 +1778,8 @@ static void team_change_rx_flags(struct net_device *dev, int change)
struct team_port *port;
int inc;
- rcu_read_lock();
- list_for_each_entry_rcu(port, &team->port_list, list) {
+ mutex_lock(&team->lock);
+ list_for_each_entry(port, &team->port_list, list) {
if (change & IFF_PROMISC) {
inc = dev->flags & IFF_PROMISC ? 1 : -1;
dev_set_promiscuity(port->dev, inc);
@@ -1789,7 +1789,7 @@ static void team_change_rx_flags(struct net_device *dev, int change)
dev_set_allmulti(port->dev, inc);
}
}
- rcu_read_unlock();
+ mutex_unlock(&team->lock);
}
static void team_set_rx_mode(struct net_device *dev)
diff --git a/drivers/net/vmxnet3/vmxnet3_drv.c b/drivers/net/vmxnet3/vmxnet3_drv.c
index 3df6aabc7e33..c676979c7ab9 100644
--- a/drivers/net/vmxnet3/vmxnet3_drv.c
+++ b/drivers/net/vmxnet3/vmxnet3_drv.c
@@ -3607,8 +3607,6 @@ vmxnet3_change_mtu(struct net_device *netdev, int new_mtu)
struct vmxnet3_adapter *adapter = netdev_priv(netdev);
int err = 0;
- WRITE_ONCE(netdev->mtu, new_mtu);
-
/*
* Reset_work may be in the middle of resetting the device, wait for its
* completion.
@@ -3622,6 +3620,7 @@ vmxnet3_change_mtu(struct net_device *netdev, int new_mtu)
/* we need to re-create the rx queue based on the new mtu */
vmxnet3_rq_destroy_all(adapter);
+ WRITE_ONCE(netdev->mtu, new_mtu);
vmxnet3_adjust_rx_ring_size(adapter);
err = vmxnet3_rq_create_all(adapter);
if (err) {
@@ -3638,6 +3637,8 @@ vmxnet3_change_mtu(struct net_device *netdev, int new_mtu)
"Closing it\n", err);
goto out;
}
+ } else {
+ WRITE_ONCE(netdev->mtu, new_mtu);
}
out:
diff --git a/drivers/net/wireless/mediatek/mt76/dma.c b/drivers/net/wireless/mediatek/mt76/dma.c
index 844af16ee551..35b4ec91979e 100644
--- a/drivers/net/wireless/mediatek/mt76/dma.c
+++ b/drivers/net/wireless/mediatek/mt76/dma.c
@@ -1011,6 +1011,7 @@ void mt76_dma_cleanup(struct mt76_dev *dev)
int i;
mt76_worker_disable(&dev->tx_worker);
+ napi_disable(&dev->tx_napi);
netif_napi_del(&dev->tx_napi);
for (i = 0; i < ARRAY_SIZE(dev->phys); i++) {
diff --git a/drivers/net/wireless/mediatek/mt76/mt7925/mcu.c b/drivers/net/wireless/mediatek/mt76/mt7925/mcu.c
index e61da76b2097..14b1f603fb62 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7925/mcu.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7925/mcu.c
@@ -1924,14 +1924,14 @@ mt7925_mcu_sta_cmd(struct mt76_phy *phy,
mt7925_mcu_sta_mld_tlv(skb, info->vif, info->link_sta->sta);
mt7925_mcu_sta_eht_mld_tlv(skb, info->vif, info->link_sta->sta);
}
-
- mt7925_mcu_sta_hdr_trans_tlv(skb, info->vif, info->link_sta);
}
if (!info->enable) {
mt7925_mcu_sta_remove_tlv(skb);
mt76_connac_mcu_add_tlv(skb, STA_REC_MLD_OFF,
sizeof(struct tlv));
+ } else {
+ mt7925_mcu_sta_hdr_trans_tlv(skb, info->vif, info->link_sta);
}
return mt76_mcu_skb_send_msg(dev, skb, info->cmd, true);
diff --git a/drivers/nvme/common/auth.c b/drivers/nvme/common/auth.c
index 2c092ec8c0a9..3b6d759bcdf2 100644
--- a/drivers/nvme/common/auth.c
+++ b/drivers/nvme/common/auth.c
@@ -242,7 +242,7 @@ struct nvme_dhchap_key *nvme_auth_transform_key(
{
const char *hmac_name;
struct crypto_shash *key_tfm;
- struct shash_desc *shash;
+ SHASH_DESC_ON_STACK(shash, key_tfm);
struct nvme_dhchap_key *transformed_key;
int ret, key_len;
@@ -267,19 +267,11 @@ struct nvme_dhchap_key *nvme_auth_transform_key(
if (IS_ERR(key_tfm))
return ERR_CAST(key_tfm);
- shash = kmalloc(sizeof(struct shash_desc) +
- crypto_shash_descsize(key_tfm),
- GFP_KERNEL);
- if (!shash) {
- ret = -ENOMEM;
- goto out_free_key;
- }
-
key_len = crypto_shash_digestsize(key_tfm);
transformed_key = nvme_auth_alloc_key(key_len, key->hash);
if (!transformed_key) {
ret = -ENOMEM;
- goto out_free_shash;
+ goto out_free_key;
}
shash->tfm = key_tfm;
@@ -299,15 +291,12 @@ struct nvme_dhchap_key *nvme_auth_transform_key(
if (ret < 0)
goto out_free_transformed_key;
- kfree(shash);
crypto_free_shash(key_tfm);
return transformed_key;
out_free_transformed_key:
nvme_auth_free_key(transformed_key);
-out_free_shash:
- kfree(shash);
out_free_key:
crypto_free_shash(key_tfm);
diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c
index 6115fef74c1e..f6ddbe553289 100644
--- a/drivers/nvme/host/auth.c
+++ b/drivers/nvme/host/auth.c
@@ -31,6 +31,7 @@ struct nvme_dhchap_queue_context {
u32 s1;
u32 s2;
bool bi_directional;
+ bool authenticated;
u16 transaction;
u8 status;
u8 dhgroup_id;
@@ -682,6 +683,7 @@ static void nvme_auth_reset_dhchap(struct nvme_dhchap_queue_context *chap)
static void nvme_auth_free_dhchap(struct nvme_dhchap_queue_context *chap)
{
nvme_auth_reset_dhchap(chap);
+ chap->authenticated = false;
if (chap->shash_tfm)
crypto_free_shash(chap->shash_tfm);
if (chap->dh_tfm)
@@ -930,12 +932,14 @@ static void nvme_queue_auth_work(struct work_struct *work)
}
if (!ret) {
chap->error = 0;
+ chap->authenticated = true;
if (ctrl->opts->concat &&
(ret = nvme_auth_secure_concat(ctrl, chap))) {
dev_warn(ctrl->device,
"%s: qid %d failed to enable secure concatenation\n",
__func__, chap->qid);
chap->error = ret;
+ chap->authenticated = false;
}
return;
}
@@ -1023,13 +1027,16 @@ static void nvme_ctrl_auth_work(struct work_struct *work)
return;
for (q = 1; q < ctrl->queue_count; q++) {
- ret = nvme_auth_negotiate(ctrl, q);
- if (ret) {
- dev_warn(ctrl->device,
- "qid %d: error %d setting up authentication\n",
- q, ret);
- break;
- }
+ struct nvme_dhchap_queue_context *chap =
+ &ctrl->dhchap_ctxs[q];
+ /*
+ * Skip re-authentication if the queue had
+ * not been authenticated initially.
+ */
+ if (!chap->authenticated)
+ continue;
+ cancel_work_sync(&chap->auth_work);
+ queue_work(nvme_auth_wq, &chap->auth_work);
}
/*
@@ -1037,7 +1044,13 @@ static void nvme_ctrl_auth_work(struct work_struct *work)
* the controller terminates the connection.
*/
for (q = 1; q < ctrl->queue_count; q++) {
- ret = nvme_auth_wait(ctrl, q);
+ struct nvme_dhchap_queue_context *chap =
+ &ctrl->dhchap_ctxs[q];
+ if (!chap->authenticated)
+ continue;
+ flush_work(&chap->auth_work);
+ ret = chap->error;
+ nvme_auth_reset_dhchap(chap);
if (ret)
dev_warn(ctrl->device,
"qid %d: authentication failed\n", q);
@@ -1076,6 +1089,7 @@ int nvme_auth_init_ctrl(struct nvme_ctrl *ctrl)
chap = &ctrl->dhchap_ctxs[i];
chap->qid = i;
chap->ctrl = ctrl;
+ chap->authenticated = false;
INIT_WORK(&chap->auth_work, nvme_queue_auth_work);
}
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index ac53629fce68..f69a232a000a 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -38,6 +38,8 @@ struct nvme_ns_info {
u32 nsid;
__le32 anagrpid;
u8 pi_offset;
+ u16 endgid;
+ u64 runs;
bool is_shared;
bool is_readonly;
bool is_ready;
@@ -150,6 +152,8 @@ static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
unsigned nsid);
static void nvme_update_keep_alive(struct nvme_ctrl *ctrl,
struct nvme_command *cmd);
+static int nvme_get_log_lsi(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page,
+ u8 lsp, u8 csi, void *log, size_t size, u64 offset, u16 lsi);
void nvme_queue_scan(struct nvme_ctrl *ctrl)
{
@@ -664,10 +668,11 @@ static void nvme_free_ns_head(struct kref *ref)
struct nvme_ns_head *head =
container_of(ref, struct nvme_ns_head, ref);
- nvme_mpath_remove_disk(head);
+ nvme_mpath_put_disk(head);
ida_free(&head->subsys->ns_ida, head->instance);
cleanup_srcu_struct(&head->srcu);
nvme_put_subsystem(head->subsys);
+ kfree(head->plids);
kfree(head);
}
@@ -991,6 +996,18 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
if (req->cmd_flags & REQ_RAHEAD)
dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
+ if (op == nvme_cmd_write && ns->head->nr_plids) {
+ u16 write_stream = req->bio->bi_write_stream;
+
+ if (WARN_ON_ONCE(write_stream > ns->head->nr_plids))
+ return BLK_STS_INVAL;
+
+ if (write_stream) {
+ dsmgmt |= ns->head->plids[write_stream - 1] << 16;
+ control |= NVME_RW_DTYPE_DPLCMT;
+ }
+ }
+
if (req->cmd_flags & REQ_ATOMIC && !nvme_valid_atomic_write(req))
return BLK_STS_INVAL;
@@ -1157,7 +1174,7 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
req->cmd_flags &= ~REQ_FAILFAST_DRIVER;
if (buffer && bufflen) {
- ret = blk_rq_map_kern(q, req, buffer, bufflen, GFP_KERNEL);
+ ret = blk_rq_map_kern(req, buffer, bufflen, GFP_KERNEL);
if (ret)
goto out;
}
@@ -1609,6 +1626,7 @@ static int nvme_ns_info_from_identify(struct nvme_ctrl *ctrl,
info->is_shared = id->nmic & NVME_NS_NMIC_SHARED;
info->is_readonly = id->nsattr & NVME_NS_ATTR_RO;
info->is_ready = true;
+ info->endgid = le16_to_cpu(id->endgid);
if (ctrl->quirks & NVME_QUIRK_BOGUS_NID) {
dev_info(ctrl->device,
"Ignoring bogus Namespace Identifiers\n");
@@ -1649,6 +1667,7 @@ static int nvme_ns_info_from_id_cs_indep(struct nvme_ctrl *ctrl,
info->is_ready = id->nstat & NVME_NSTAT_NRDY;
info->is_rotational = id->nsfeat & NVME_NS_ROTATIONAL;
info->no_vwc = id->nsfeat & NVME_NS_VWC_NOT_PRESENT;
+ info->endgid = le16_to_cpu(id->endgid);
}
kfree(id);
return ret;
@@ -1674,7 +1693,7 @@ static int nvme_features(struct nvme_ctrl *dev, u8 op, unsigned int fid,
int nvme_set_features(struct nvme_ctrl *dev, unsigned int fid,
unsigned int dword11, void *buffer, size_t buflen,
- u32 *result)
+ void *result)
{
return nvme_features(dev, nvme_admin_set_features, fid, dword11, buffer,
buflen, result);
@@ -1683,7 +1702,7 @@ EXPORT_SYMBOL_GPL(nvme_set_features);
int nvme_get_features(struct nvme_ctrl *dev, unsigned int fid,
unsigned int dword11, void *buffer, size_t buflen,
- u32 *result)
+ void *result)
{
return nvme_features(dev, nvme_admin_get_features, fid, dword11, buffer,
buflen, result);
@@ -2059,7 +2078,21 @@ static bool nvme_update_disk_info(struct nvme_ns *ns, struct nvme_id_ns *id,
if (id->nsfeat & NVME_NS_FEAT_ATOMICS && id->nawupf)
atomic_bs = (1 + le16_to_cpu(id->nawupf)) * bs;
else
- atomic_bs = (1 + ns->ctrl->subsys->awupf) * bs;
+ atomic_bs = (1 + ns->ctrl->awupf) * bs;
+
+ /*
+ * Set subsystem atomic bs.
+ */
+ if (ns->ctrl->subsys->atomic_bs) {
+ if (atomic_bs != ns->ctrl->subsys->atomic_bs) {
+ dev_err_ratelimited(ns->ctrl->device,
+ "%s: Inconsistent Atomic Write Size, Namespace will not be added: Subsystem=%d bytes, Controller/Namespace=%d bytes\n",
+ ns->disk ? ns->disk->disk_name : "?",
+ ns->ctrl->subsys->atomic_bs,
+ atomic_bs);
+ }
+ } else
+ ns->ctrl->subsys->atomic_bs = atomic_bs;
nvme_update_atomic_write_disk_info(ns, id, lim, bs, atomic_bs);
}
@@ -2153,6 +2186,148 @@ static int nvme_update_ns_info_generic(struct nvme_ns *ns,
return ret;
}
+static int nvme_query_fdp_granularity(struct nvme_ctrl *ctrl,
+ struct nvme_ns_info *info, u8 fdp_idx)
+{
+ struct nvme_fdp_config_log hdr, *h;
+ struct nvme_fdp_config_desc *desc;
+ size_t size = sizeof(hdr);
+ void *log, *end;
+ int i, n, ret;
+
+ ret = nvme_get_log_lsi(ctrl, 0, NVME_LOG_FDP_CONFIGS, 0,
+ NVME_CSI_NVM, &hdr, size, 0, info->endgid);
+ if (ret) {
+ dev_warn(ctrl->device,
+ "FDP configs log header status:0x%x endgid:%d\n", ret,
+ info->endgid);
+ return ret;
+ }
+
+ size = le32_to_cpu(hdr.sze);
+ if (size > PAGE_SIZE * MAX_ORDER_NR_PAGES) {
+ dev_warn(ctrl->device, "FDP config size too large:%zu\n",
+ size);
+ return 0;
+ }
+
+ h = kvmalloc(size, GFP_KERNEL);
+ if (!h)
+ return -ENOMEM;
+
+ ret = nvme_get_log_lsi(ctrl, 0, NVME_LOG_FDP_CONFIGS, 0,
+ NVME_CSI_NVM, h, size, 0, info->endgid);
+ if (ret) {
+ dev_warn(ctrl->device,
+ "FDP configs log status:0x%x endgid:%d\n", ret,
+ info->endgid);
+ goto out;
+ }
+
+ n = le16_to_cpu(h->numfdpc) + 1;
+ if (fdp_idx > n) {
+ dev_warn(ctrl->device, "FDP index:%d out of range:%d\n",
+ fdp_idx, n);
+ /* Proceed without registering FDP streams */
+ ret = 0;
+ goto out;
+ }
+
+ log = h + 1;
+ desc = log;
+ end = log + size - sizeof(*h);
+ for (i = 0; i < fdp_idx; i++) {
+ log += le16_to_cpu(desc->dsze);
+ desc = log;
+ if (log >= end) {
+ dev_warn(ctrl->device,
+ "FDP invalid config descriptor list\n");
+ ret = 0;
+ goto out;
+ }
+ }
+
+ if (le32_to_cpu(desc->nrg) > 1) {
+ dev_warn(ctrl->device, "FDP NRG > 1 not supported\n");
+ ret = 0;
+ goto out;
+ }
+
+ info->runs = le64_to_cpu(desc->runs);
+out:
+ kvfree(h);
+ return ret;
+}
+
+static int nvme_query_fdp_info(struct nvme_ns *ns, struct nvme_ns_info *info)
+{
+ struct nvme_ns_head *head = ns->head;
+ struct nvme_ctrl *ctrl = ns->ctrl;
+ struct nvme_fdp_ruh_status *ruhs;
+ struct nvme_fdp_config fdp;
+ struct nvme_command c = {};
+ size_t size;
+ int i, ret;
+
+ /*
+ * The FDP configuration is static for the lifetime of the namespace,
+ * so return immediately if we've already registered this namespace's
+ * streams.
+ */
+ if (head->nr_plids)
+ return 0;
+
+ ret = nvme_get_features(ctrl, NVME_FEAT_FDP, info->endgid, NULL, 0,
+ &fdp);
+ if (ret) {
+ dev_warn(ctrl->device, "FDP get feature status:0x%x\n", ret);
+ return ret;
+ }
+
+ if (!(fdp.flags & FDPCFG_FDPE))
+ return 0;
+
+ ret = nvme_query_fdp_granularity(ctrl, info, fdp.fdpcidx);
+ if (!info->runs)
+ return ret;
+
+ size = struct_size(ruhs, ruhsd, S8_MAX - 1);
+ ruhs = kzalloc(size, GFP_KERNEL);
+ if (!ruhs)
+ return -ENOMEM;
+
+ c.imr.opcode = nvme_cmd_io_mgmt_recv;
+ c.imr.nsid = cpu_to_le32(head->ns_id);
+ c.imr.mo = NVME_IO_MGMT_RECV_MO_RUHS;
+ c.imr.numd = cpu_to_le32(nvme_bytes_to_numd(size));
+ ret = nvme_submit_sync_cmd(ns->queue, &c, ruhs, size);
+ if (ret) {
+ dev_warn(ctrl->device, "FDP io-mgmt status:0x%x\n", ret);
+ goto free;
+ }
+
+ head->nr_plids = le16_to_cpu(ruhs->nruhsd);
+ if (!head->nr_plids)
+ goto free;
+
+ head->plids = kcalloc(head->nr_plids, sizeof(*head->plids),
+ GFP_KERNEL);
+ if (!head->plids) {
+ dev_warn(ctrl->device,
+ "failed to allocate %u FDP placement IDs\n",
+ head->nr_plids);
+ head->nr_plids = 0;
+ ret = -ENOMEM;
+ goto free;
+ }
+
+ for (i = 0; i < head->nr_plids; i++)
+ head->plids[i] = le16_to_cpu(ruhs->ruhsd[i].pid);
+free:
+ kfree(ruhs);
+ return ret;
+}
+
static int nvme_update_ns_info_block(struct nvme_ns *ns,
struct nvme_ns_info *info)
{
@@ -2190,6 +2365,12 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
goto out;
}
+ if (ns->ctrl->ctratt & NVME_CTRL_ATTR_FDPS) {
+ ret = nvme_query_fdp_info(ns, info);
+ if (ret < 0)
+ goto out;
+ }
+
lim = queue_limits_start_update(ns->disk->queue);
memflags = blk_mq_freeze_queue(ns->disk->queue);
@@ -2201,6 +2382,17 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
nvme_set_chunk_sectors(ns, id, &lim);
if (!nvme_update_disk_info(ns, id, &lim))
capacity = 0;
+
+ /*
+ * Validate the max atomic write size fits within the subsystem's
+ * atomic write capabilities.
+ */
+ if (lim.atomic_write_hw_max > ns->ctrl->subsys->atomic_bs) {
+ blk_mq_unfreeze_queue(ns->disk->queue, memflags);
+ ret = -ENXIO;
+ goto out;
+ }
+
nvme_config_discard(ns, &lim);
if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
ns->head->ids.csi == NVME_CSI_ZNS)
@@ -2223,6 +2415,12 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
if (!nvme_init_integrity(ns->head, &lim, info))
capacity = 0;
+ lim.max_write_streams = ns->head->nr_plids;
+ if (lim.max_write_streams)
+ lim.write_stream_granularity = min(info->runs, U32_MAX);
+ else
+ lim.write_stream_granularity = 0;
+
ret = queue_limits_commit_update(ns->disk->queue, &lim);
if (ret) {
blk_mq_unfreeze_queue(ns->disk->queue, memflags);
@@ -2326,6 +2524,8 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_ns_info *info)
ns->head->disk->flags |= GENHD_FL_HIDDEN;
else
nvme_init_integrity(ns->head, &lim, info);
+ lim.max_write_streams = ns_lim->max_write_streams;
+ lim.write_stream_granularity = ns_lim->write_stream_granularity;
ret = queue_limits_commit_update(ns->head->disk->queue, &lim);
set_capacity_and_notify(ns->head->disk, get_capacity(ns->disk));
@@ -3031,7 +3231,6 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
kfree(subsys);
return -EINVAL;
}
- subsys->awupf = le16_to_cpu(id->awupf);
nvme_mpath_default_iopolicy(subsys);
subsys->dev.class = &nvme_subsys_class;
@@ -3084,8 +3283,8 @@ out_unlock:
return ret;
}
-int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi,
- void *log, size_t size, u64 offset)
+static int nvme_get_log_lsi(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page,
+ u8 lsp, u8 csi, void *log, size_t size, u64 offset, u16 lsi)
{
struct nvme_command c = { };
u32 dwlen = nvme_bytes_to_numd(size);
@@ -3099,10 +3298,18 @@ int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi,
c.get_log_page.lpol = cpu_to_le32(lower_32_bits(offset));
c.get_log_page.lpou = cpu_to_le32(upper_32_bits(offset));
c.get_log_page.csi = csi;
+ c.get_log_page.lsi = cpu_to_le16(lsi);
return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size);
}
+int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi,
+ void *log, size_t size, u64 offset)
+{
+ return nvme_get_log_lsi(ctrl, nsid, log_page, lsp, csi, log, size,
+ offset, 0);
+}
+
static int nvme_get_effects_log(struct nvme_ctrl *ctrl, u8 csi,
struct nvme_effects_log **log)
{
@@ -3441,7 +3648,7 @@ static int nvme_init_identify(struct nvme_ctrl *ctrl)
dev_pm_qos_expose_latency_tolerance(ctrl->device);
else if (!ctrl->apst_enabled && prev_apst_enabled)
dev_pm_qos_hide_latency_tolerance(ctrl->device);
-
+ ctrl->awupf = le16_to_cpu(id->awupf);
out_free:
kfree(id);
return ret;
@@ -3560,7 +3767,7 @@ static struct nvme_ns_head *nvme_find_ns_head(struct nvme_ctrl *ctrl,
*/
if (h->ns_id != nsid || !nvme_is_unique_nsid(ctrl, h))
continue;
- if (!list_empty(&h->list) && nvme_tryget_ns_head(h))
+ if (nvme_tryget_ns_head(h))
return h;
}
@@ -3804,7 +4011,8 @@ static int nvme_init_ns_head(struct nvme_ns *ns, struct nvme_ns_info *info)
}
} else {
ret = -EINVAL;
- if (!info->is_shared || !head->shared) {
+ if ((!info->is_shared || !head->shared) &&
+ !list_empty(&head->list)) {
dev_err(ctrl->device,
"Duplicate unshared namespace %d\n",
info->nsid);
@@ -4008,7 +4216,8 @@ static void nvme_ns_remove(struct nvme_ns *ns)
mutex_lock(&ns->ctrl->subsys->lock);
list_del_rcu(&ns->siblings);
if (list_empty(&ns->head->list)) {
- list_del_init(&ns->head->entry);
+ if (!nvme_mpath_queue_if_no_path(ns->head))
+ list_del_init(&ns->head->entry);
last_path = true;
}
mutex_unlock(&ns->ctrl->subsys->lock);
@@ -4029,7 +4238,7 @@ static void nvme_ns_remove(struct nvme_ns *ns)
synchronize_srcu(&ns->ctrl->srcu);
if (last_path)
- nvme_mpath_shutdown_disk(ns->head);
+ nvme_mpath_remove_disk(ns->head);
nvme_put_ns(ns);
}
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index 2257c3c96dd2..fdafa3e9e66f 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -1410,9 +1410,8 @@ nvme_fc_xmt_disconnect_assoc(struct nvme_fc_ctrl *ctrl)
}
static void
-nvme_fc_xmt_ls_rsp_done(struct nvmefc_ls_rsp *lsrsp)
+nvme_fc_xmt_ls_rsp_free(struct nvmefc_ls_rcv_op *lsop)
{
- struct nvmefc_ls_rcv_op *lsop = lsrsp->nvme_fc_private;
struct nvme_fc_rport *rport = lsop->rport;
struct nvme_fc_lport *lport = rport->lport;
unsigned long flags;
@@ -1434,6 +1433,14 @@ nvme_fc_xmt_ls_rsp_done(struct nvmefc_ls_rsp *lsrsp)
}
static void
+nvme_fc_xmt_ls_rsp_done(struct nvmefc_ls_rsp *lsrsp)
+{
+ struct nvmefc_ls_rcv_op *lsop = lsrsp->nvme_fc_private;
+
+ nvme_fc_xmt_ls_rsp_free(lsop);
+}
+
+static void
nvme_fc_xmt_ls_rsp(struct nvmefc_ls_rcv_op *lsop)
{
struct nvme_fc_rport *rport = lsop->rport;
@@ -1450,7 +1457,7 @@ nvme_fc_xmt_ls_rsp(struct nvmefc_ls_rcv_op *lsop)
dev_warn(lport->dev,
"LLDD rejected LS RSP xmt: LS %d status %d\n",
w0->ls_cmd, ret);
- nvme_fc_xmt_ls_rsp_done(lsop->lsrsp);
+ nvme_fc_xmt_ls_rsp_free(lsop);
return;
}
}
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 250f3da67cc9..878ea8b1a0ac 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -10,10 +10,61 @@
#include "nvme.h"
bool multipath = true;
-module_param(multipath, bool, 0444);
+static bool multipath_always_on;
+
+static int multipath_param_set(const char *val, const struct kernel_param *kp)
+{
+ int ret;
+ bool *arg = kp->arg;
+
+ ret = param_set_bool(val, kp);
+ if (ret)
+ return ret;
+
+ if (multipath_always_on && !*arg) {
+ pr_err("Can't disable multipath when multipath_always_on is configured.\n");
+ *arg = true;
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static const struct kernel_param_ops multipath_param_ops = {
+ .set = multipath_param_set,
+ .get = param_get_bool,
+};
+
+module_param_cb(multipath, &multipath_param_ops, &multipath, 0444);
MODULE_PARM_DESC(multipath,
"turn on native support for multiple controllers per subsystem");
+static int multipath_always_on_set(const char *val,
+ const struct kernel_param *kp)
+{
+ int ret;
+ bool *arg = kp->arg;
+
+ ret = param_set_bool(val, kp);
+ if (ret < 0)
+ return ret;
+
+ if (*arg)
+ multipath = true;
+
+ return 0;
+}
+
+static const struct kernel_param_ops multipath_always_on_ops = {
+ .set = multipath_always_on_set,
+ .get = param_get_bool,
+};
+
+module_param_cb(multipath_always_on, &multipath_always_on_ops,
+ &multipath_always_on, 0444);
+MODULE_PARM_DESC(multipath_always_on,
+ "create multipath node always except for private namespace with non-unique nsid; note that this also implicitly enables native multipath support");
+
static const char *nvme_iopolicy_names[] = {
[NVME_IOPOLICY_NUMA] = "numa",
[NVME_IOPOLICY_RR] = "round-robin",
@@ -442,7 +493,17 @@ static bool nvme_available_path(struct nvme_ns_head *head)
break;
}
}
- return false;
+
+ /*
+ * If "head->delayed_removal_secs" is configured (i.e., non-zero), do
+ * not immediately fail I/O. Instead, requeue the I/O for the configured
+ * duration, anticipating that if there's a transient link failure then
+ * it may recover within this time window. This parameter is exported to
+ * userspace via sysfs, and its default value is zero. It is internally
+ * mapped to NVME_NSHEAD_QUEUE_IF_NO_PATH. When delayed_removal_secs is
+ * non-zero, this flag is set to true. When zero, the flag is cleared.
+ */
+ return nvme_mpath_queue_if_no_path(head);
}
static void nvme_ns_head_submit_bio(struct bio *bio)
@@ -617,6 +678,40 @@ static void nvme_requeue_work(struct work_struct *work)
}
}
+static void nvme_remove_head(struct nvme_ns_head *head)
+{
+ if (test_and_clear_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
+ /*
+ * requeue I/O after NVME_NSHEAD_DISK_LIVE has been cleared
+ * to allow multipath to fail all I/O.
+ */
+ kblockd_schedule_work(&head->requeue_work);
+
+ nvme_cdev_del(&head->cdev, &head->cdev_device);
+ synchronize_srcu(&head->srcu);
+ del_gendisk(head->disk);
+ nvme_put_ns_head(head);
+ }
+}
+
+static void nvme_remove_head_work(struct work_struct *work)
+{
+ struct nvme_ns_head *head = container_of(to_delayed_work(work),
+ struct nvme_ns_head, remove_work);
+ bool remove = false;
+
+ mutex_lock(&head->subsys->lock);
+ if (list_empty(&head->list)) {
+ list_del_init(&head->entry);
+ remove = true;
+ }
+ mutex_unlock(&head->subsys->lock);
+ if (remove)
+ nvme_remove_head(head);
+
+ module_put(THIS_MODULE);
+}
+
int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
{
struct queue_limits lim;
@@ -626,19 +721,31 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
spin_lock_init(&head->requeue_lock);
INIT_WORK(&head->requeue_work, nvme_requeue_work);
INIT_WORK(&head->partition_scan_work, nvme_partition_scan_work);
+ INIT_DELAYED_WORK(&head->remove_work, nvme_remove_head_work);
+ head->delayed_removal_secs = 0;
/*
- * Add a multipath node if the subsystems supports multiple controllers.
- * We also do this for private namespaces as the namespace sharing flag
- * could change after a rescan.
+ * If "multipath_always_on" is enabled, a multipath node is added
+ * regardless of whether the disk is single/multi ported, and whether
+ * the namespace is shared or private. If "multipath_always_on" is not
+ * enabled, a multipath node is added only if the subsystem supports
+ * multiple controllers and the "multipath" option is configured. In
+ * either case, for private namespaces, we ensure that the NSID is
+ * unique.
*/
- if (!(ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) ||
- !nvme_is_unique_nsid(ctrl, head) || !multipath)
+ if (!multipath_always_on) {
+ if (!(ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) ||
+ !multipath)
+ return 0;
+ }
+
+ if (!nvme_is_unique_nsid(ctrl, head))
return 0;
blk_set_stacking_limits(&lim);
lim.dma_alignment = 3;
- lim.features |= BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT | BLK_FEAT_POLL;
+ lim.features |= BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT |
+ BLK_FEAT_POLL | BLK_FEAT_ATOMIC_WRITES;
if (head->ids.csi == NVME_CSI_ZNS)
lim.features |= BLK_FEAT_ZONED;
@@ -659,6 +766,7 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
set_bit(GD_SUPPRESS_PART_SCAN, &head->disk->state);
sprintf(head->disk->disk_name, "nvme%dn%d",
ctrl->subsys->instance, head->instance);
+ nvme_tryget_ns_head(head);
return 0;
}
@@ -1015,6 +1123,49 @@ static ssize_t numa_nodes_show(struct device *dev, struct device_attribute *attr
}
DEVICE_ATTR_RO(numa_nodes);
+static ssize_t delayed_removal_secs_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct gendisk *disk = dev_to_disk(dev);
+ struct nvme_ns_head *head = disk->private_data;
+ int ret;
+
+ mutex_lock(&head->subsys->lock);
+ ret = sysfs_emit(buf, "%u\n", head->delayed_removal_secs);
+ mutex_unlock(&head->subsys->lock);
+ return ret;
+}
+
+static ssize_t delayed_removal_secs_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t count)
+{
+ struct gendisk *disk = dev_to_disk(dev);
+ struct nvme_ns_head *head = disk->private_data;
+ unsigned int sec;
+ int ret;
+
+ ret = kstrtouint(buf, 0, &sec);
+ if (ret < 0)
+ return ret;
+
+ mutex_lock(&head->subsys->lock);
+ head->delayed_removal_secs = sec;
+ if (sec)
+ set_bit(NVME_NSHEAD_QUEUE_IF_NO_PATH, &head->flags);
+ else
+ clear_bit(NVME_NSHEAD_QUEUE_IF_NO_PATH, &head->flags);
+ mutex_unlock(&head->subsys->lock);
+ /*
+ * Ensure that update to NVME_NSHEAD_QUEUE_IF_NO_PATH is seen
+ * by its reader.
+ */
+ synchronize_srcu(&head->srcu);
+
+ return count;
+}
+
+DEVICE_ATTR_RW(delayed_removal_secs);
+
static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl,
struct nvme_ana_group_desc *desc, void *data)
{
@@ -1136,23 +1287,43 @@ void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid)
#endif
}
-void nvme_mpath_shutdown_disk(struct nvme_ns_head *head)
+void nvme_mpath_remove_disk(struct nvme_ns_head *head)
{
- if (!head->disk)
- return;
- if (test_and_clear_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
- nvme_cdev_del(&head->cdev, &head->cdev_device);
+ bool remove = false;
+
+ mutex_lock(&head->subsys->lock);
+ /*
+ * We are called when all paths have been removed, and at that point
+ * head->list is expected to be empty. However, nvme_remove_ns() and
+ * nvme_init_ns_head() can run concurrently and so if head->delayed_
+ * removal_secs is configured, it is possible that by the time we reach
+ * this point, head->list may no longer be empty. Therefore, we recheck
+ * head->list here. If it is no longer empty then we skip enqueuing the
+ * delayed head removal work.
+ */
+ if (!list_empty(&head->list))
+ goto out;
+
+ if (head->delayed_removal_secs) {
/*
- * requeue I/O after NVME_NSHEAD_DISK_LIVE has been cleared
- * to allow multipath to fail all I/O.
+ * Ensure that no one could remove this module while the head
+ * remove work is pending.
*/
- synchronize_srcu(&head->srcu);
- kblockd_schedule_work(&head->requeue_work);
- del_gendisk(head->disk);
+ if (!try_module_get(THIS_MODULE))
+ goto out;
+ queue_delayed_work(nvme_wq, &head->remove_work,
+ head->delayed_removal_secs * HZ);
+ } else {
+ list_del_init(&head->entry);
+ remove = true;
}
+out:
+ mutex_unlock(&head->subsys->lock);
+ if (remove)
+ nvme_remove_head(head);
}
-void nvme_mpath_remove_disk(struct nvme_ns_head *head)
+void nvme_mpath_put_disk(struct nvme_ns_head *head)
{
if (!head->disk)
return;
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 51e078642127..ad0c1f834f09 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -410,6 +410,7 @@ struct nvme_ctrl {
enum nvme_ctrl_type cntrltype;
enum nvme_dctype dctype;
+ u16 awupf; /* 0's based value. */
};
static inline enum nvme_ctrl_state nvme_ctrl_state(struct nvme_ctrl *ctrl)
@@ -442,11 +443,11 @@ struct nvme_subsystem {
u8 cmic;
enum nvme_subsys_type subtype;
u16 vendor_id;
- u16 awupf; /* 0's based awupf value. */
struct ida ns_ida;
#ifdef CONFIG_NVME_MULTIPATH
enum nvme_iopolicy iopolicy;
#endif
+ u32 atomic_bs;
};
/*
@@ -496,6 +497,9 @@ struct nvme_ns_head {
struct device cdev_device;
struct gendisk *disk;
+
+ u16 nr_plids;
+ u16 *plids;
#ifdef CONFIG_NVME_MULTIPATH
struct bio_list requeue_list;
spinlock_t requeue_lock;
@@ -503,7 +507,10 @@ struct nvme_ns_head {
struct work_struct partition_scan_work;
struct mutex lock;
unsigned long flags;
-#define NVME_NSHEAD_DISK_LIVE 0
+ struct delayed_work remove_work;
+ unsigned int delayed_removal_secs;
+#define NVME_NSHEAD_DISK_LIVE 0
+#define NVME_NSHEAD_QUEUE_IF_NO_PATH 1
struct nvme_ns __rcu *current_path[];
#endif
};
@@ -896,10 +903,10 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
int qid, nvme_submit_flags_t flags);
int nvme_set_features(struct nvme_ctrl *dev, unsigned int fid,
unsigned int dword11, void *buffer, size_t buflen,
- u32 *result);
+ void *result);
int nvme_get_features(struct nvme_ctrl *dev, unsigned int fid,
unsigned int dword11, void *buffer, size_t buflen,
- u32 *result);
+ void *result);
int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count);
void nvme_stop_keep_alive(struct nvme_ctrl *ctrl);
int nvme_reset_ctrl(struct nvme_ctrl *ctrl);
@@ -960,7 +967,7 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head);
void nvme_mpath_add_sysfs_link(struct nvme_ns_head *ns);
void nvme_mpath_remove_sysfs_link(struct nvme_ns *ns);
void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid);
-void nvme_mpath_remove_disk(struct nvme_ns_head *head);
+void nvme_mpath_put_disk(struct nvme_ns_head *head);
int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id);
void nvme_mpath_init_ctrl(struct nvme_ctrl *ctrl);
void nvme_mpath_update(struct nvme_ctrl *ctrl);
@@ -969,7 +976,7 @@ void nvme_mpath_stop(struct nvme_ctrl *ctrl);
bool nvme_mpath_clear_current_path(struct nvme_ns *ns);
void nvme_mpath_revalidate_paths(struct nvme_ns *ns);
void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl);
-void nvme_mpath_shutdown_disk(struct nvme_ns_head *head);
+void nvme_mpath_remove_disk(struct nvme_ns_head *head);
void nvme_mpath_start_request(struct request *rq);
void nvme_mpath_end_request(struct request *rq);
@@ -986,12 +993,19 @@ extern struct device_attribute dev_attr_ana_grpid;
extern struct device_attribute dev_attr_ana_state;
extern struct device_attribute dev_attr_queue_depth;
extern struct device_attribute dev_attr_numa_nodes;
+extern struct device_attribute dev_attr_delayed_removal_secs;
extern struct device_attribute subsys_attr_iopolicy;
static inline bool nvme_disk_is_ns_head(struct gendisk *disk)
{
return disk->fops == &nvme_ns_head_ops;
}
+static inline bool nvme_mpath_queue_if_no_path(struct nvme_ns_head *head)
+{
+ if (test_bit(NVME_NSHEAD_QUEUE_IF_NO_PATH, &head->flags))
+ return true;
+ return false;
+}
#else
#define multipath false
static inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl)
@@ -1012,7 +1026,7 @@ static inline int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,
static inline void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid)
{
}
-static inline void nvme_mpath_remove_disk(struct nvme_ns_head *head)
+static inline void nvme_mpath_put_disk(struct nvme_ns_head *head)
{
}
static inline void nvme_mpath_add_sysfs_link(struct nvme_ns *ns)
@@ -1031,7 +1045,7 @@ static inline void nvme_mpath_revalidate_paths(struct nvme_ns *ns)
static inline void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl)
{
}
-static inline void nvme_mpath_shutdown_disk(struct nvme_ns_head *head)
+static inline void nvme_mpath_remove_disk(struct nvme_ns_head *head)
{
}
static inline void nvme_trace_bio_complete(struct request *req)
@@ -1079,6 +1093,10 @@ static inline bool nvme_disk_is_ns_head(struct gendisk *disk)
{
return false;
}
+static inline bool nvme_mpath_queue_if_no_path(struct nvme_ns_head *head)
+{
+ return false;
+}
#endif /* CONFIG_NVME_MULTIPATH */
int nvme_ns_get_unique_id(struct nvme_ns *ns, u8 id[16],
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 2e30e9be7408..e0bfe04a2bc2 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -18,6 +18,7 @@
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/mutex.h>
+#include <linux/nodemask.h>
#include <linux/once.h>
#include <linux/pci.h>
#include <linux/suspend.h>
@@ -34,16 +35,31 @@
#define SQ_SIZE(q) ((q)->q_depth << (q)->sqes)
#define CQ_SIZE(q) ((q)->q_depth * sizeof(struct nvme_completion))
-#define SGES_PER_PAGE (NVME_CTRL_PAGE_SIZE / sizeof(struct nvme_sgl_desc))
+/* Optimisation for I/Os between 4k and 128k */
+#define NVME_SMALL_POOL_SIZE 256
/*
* These can be higher, but we need to ensure that any command doesn't
* require an sg allocation that needs more than a page of data.
*/
#define NVME_MAX_KB_SZ 8192
-#define NVME_MAX_SEGS 128
-#define NVME_MAX_META_SEGS 15
-#define NVME_MAX_NR_ALLOCATIONS 5
+#define NVME_MAX_NR_DESCRIPTORS 5
+
+/*
+ * For data SGLs we support a single descriptors worth of SGL entries, but for
+ * now we also limit it to avoid an allocation larger than PAGE_SIZE for the
+ * scatterlist.
+ */
+#define NVME_MAX_SEGS \
+ min(NVME_CTRL_PAGE_SIZE / sizeof(struct nvme_sgl_desc), \
+ (PAGE_SIZE / sizeof(struct scatterlist)))
+
+/*
+ * For metadata SGLs, only the small descriptor is supported, and the first
+ * entry is the segment descriptor, which for the data pointer sits in the SQE.
+ */
+#define NVME_MAX_META_SEGS \
+ ((NVME_SMALL_POOL_SIZE / sizeof(struct nvme_sgl_desc)) - 1)
static int use_threaded_interrupts;
module_param(use_threaded_interrupts, int, 0444);
@@ -112,6 +128,11 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown);
static void nvme_delete_io_queues(struct nvme_dev *dev);
static void nvme_update_attrs(struct nvme_dev *dev);
+struct nvme_descriptor_pools {
+ struct dma_pool *large;
+ struct dma_pool *small;
+};
+
/*
* Represents an NVM Express device. Each nvme_dev is a PCI function.
*/
@@ -121,8 +142,6 @@ struct nvme_dev {
struct blk_mq_tag_set admin_tagset;
u32 __iomem *dbs;
struct device *dev;
- struct dma_pool *prp_page_pool;
- struct dma_pool *prp_small_pool;
unsigned online_queues;
unsigned max_qid;
unsigned io_queues[HCTX_MAX_TYPES];
@@ -162,6 +181,7 @@ struct nvme_dev {
unsigned int nr_allocated_queues;
unsigned int nr_write_queues;
unsigned int nr_poll_queues;
+ struct nvme_descriptor_pools descriptor_pools[];
};
static int io_queue_depth_set(const char *val, const struct kernel_param *kp)
@@ -191,6 +211,7 @@ static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl)
*/
struct nvme_queue {
struct nvme_dev *dev;
+ struct nvme_descriptor_pools descriptor_pools;
spinlock_t sq_lock;
void *sq_cmds;
/* only used for poll queues: */
@@ -219,30 +240,30 @@ struct nvme_queue {
struct completion delete_done;
};
-union nvme_descriptor {
- struct nvme_sgl_desc *sg_list;
- __le64 *prp_list;
+/* bits for iod->flags */
+enum nvme_iod_flags {
+ /* this command has been aborted by the timeout handler */
+ IOD_ABORTED = 1U << 0,
+
+ /* uses the small descriptor pool */
+ IOD_SMALL_DESCRIPTOR = 1U << 1,
};
/*
* The nvme_iod describes the data in an I/O.
- *
- * The sg pointer contains the list of PRP/SGL chunk allocations in addition
- * to the actual struct scatterlist.
*/
struct nvme_iod {
struct nvme_request req;
struct nvme_command cmd;
- bool aborted;
- s8 nr_allocations; /* PRP list pool allocations. 0 means small
- pool in use */
+ u8 flags;
+ u8 nr_descriptors;
unsigned int dma_len; /* length of single DMA segment mapping */
dma_addr_t first_dma;
dma_addr_t meta_dma;
struct sg_table sgt;
struct sg_table meta_sgt;
- union nvme_descriptor meta_list;
- union nvme_descriptor list[NVME_MAX_NR_ALLOCATIONS];
+ struct nvme_sgl_desc *meta_descriptor;
+ void *descriptors[NVME_MAX_NR_DESCRIPTORS];
};
static inline unsigned int nvme_dbbuf_size(struct nvme_dev *dev)
@@ -390,37 +411,85 @@ static bool nvme_dbbuf_update_and_check_event(u16 value, __le32 *dbbuf_db,
* as it only leads to a small amount of wasted memory for the lifetime of
* the I/O.
*/
-static int nvme_pci_npages_prp(void)
+static __always_inline int nvme_pci_npages_prp(void)
{
unsigned max_bytes = (NVME_MAX_KB_SZ * 1024) + NVME_CTRL_PAGE_SIZE;
unsigned nprps = DIV_ROUND_UP(max_bytes, NVME_CTRL_PAGE_SIZE);
return DIV_ROUND_UP(8 * nprps, NVME_CTRL_PAGE_SIZE - 8);
}
-static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
- unsigned int hctx_idx)
+static struct nvme_descriptor_pools *
+nvme_setup_descriptor_pools(struct nvme_dev *dev, unsigned numa_node)
{
- struct nvme_dev *dev = to_nvme_dev(data);
- struct nvme_queue *nvmeq = &dev->queues[0];
+ struct nvme_descriptor_pools *pools = &dev->descriptor_pools[numa_node];
+ size_t small_align = NVME_SMALL_POOL_SIZE;
- WARN_ON(hctx_idx != 0);
- WARN_ON(dev->admin_tagset.tags[0] != hctx->tags);
+ if (pools->small)
+ return pools; /* already initialized */
- hctx->driver_data = nvmeq;
- return 0;
+ pools->large = dma_pool_create_node("nvme descriptor page", dev->dev,
+ NVME_CTRL_PAGE_SIZE, NVME_CTRL_PAGE_SIZE, 0, numa_node);
+ if (!pools->large)
+ return ERR_PTR(-ENOMEM);
+
+ if (dev->ctrl.quirks & NVME_QUIRK_DMAPOOL_ALIGN_512)
+ small_align = 512;
+
+ pools->small = dma_pool_create_node("nvme descriptor small", dev->dev,
+ NVME_SMALL_POOL_SIZE, small_align, 0, numa_node);
+ if (!pools->small) {
+ dma_pool_destroy(pools->large);
+ pools->large = NULL;
+ return ERR_PTR(-ENOMEM);
+ }
+
+ return pools;
}
-static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
- unsigned int hctx_idx)
+static void nvme_release_descriptor_pools(struct nvme_dev *dev)
+{
+ unsigned i;
+
+ for (i = 0; i < nr_node_ids; i++) {
+ struct nvme_descriptor_pools *pools = &dev->descriptor_pools[i];
+
+ dma_pool_destroy(pools->large);
+ dma_pool_destroy(pools->small);
+ }
+}
+
+static int nvme_init_hctx_common(struct blk_mq_hw_ctx *hctx, void *data,
+ unsigned qid)
{
struct nvme_dev *dev = to_nvme_dev(data);
- struct nvme_queue *nvmeq = &dev->queues[hctx_idx + 1];
+ struct nvme_queue *nvmeq = &dev->queues[qid];
+ struct nvme_descriptor_pools *pools;
+ struct blk_mq_tags *tags;
+
+ tags = qid ? dev->tagset.tags[qid - 1] : dev->admin_tagset.tags[0];
+ WARN_ON(tags != hctx->tags);
+ pools = nvme_setup_descriptor_pools(dev, hctx->numa_node);
+ if (IS_ERR(pools))
+ return PTR_ERR(pools);
- WARN_ON(dev->tagset.tags[hctx_idx] != hctx->tags);
+ nvmeq->descriptor_pools = *pools;
hctx->driver_data = nvmeq;
return 0;
}
+static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
+ unsigned int hctx_idx)
+{
+ WARN_ON(hctx_idx != 0);
+ return nvme_init_hctx_common(hctx, data, 0);
+}
+
+static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
+ unsigned int hctx_idx)
+{
+ return nvme_init_hctx_common(hctx, data, hctx_idx + 1);
+}
+
static int nvme_pci_init_request(struct blk_mq_tag_set *set,
struct request *req, unsigned int hctx_idx,
unsigned int numa_node)
@@ -537,23 +606,39 @@ static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req,
return true;
}
-static void nvme_free_prps(struct nvme_dev *dev, struct request *req)
+static inline struct dma_pool *nvme_dma_pool(struct nvme_queue *nvmeq,
+ struct nvme_iod *iod)
+{
+ if (iod->flags & IOD_SMALL_DESCRIPTOR)
+ return nvmeq->descriptor_pools.small;
+ return nvmeq->descriptor_pools.large;
+}
+
+static void nvme_free_descriptors(struct nvme_queue *nvmeq, struct request *req)
{
const int last_prp = NVME_CTRL_PAGE_SIZE / sizeof(__le64) - 1;
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
dma_addr_t dma_addr = iod->first_dma;
int i;
- for (i = 0; i < iod->nr_allocations; i++) {
- __le64 *prp_list = iod->list[i].prp_list;
+ if (iod->nr_descriptors == 1) {
+ dma_pool_free(nvme_dma_pool(nvmeq, iod), iod->descriptors[0],
+ dma_addr);
+ return;
+ }
+
+ for (i = 0; i < iod->nr_descriptors; i++) {
+ __le64 *prp_list = iod->descriptors[i];
dma_addr_t next_dma_addr = le64_to_cpu(prp_list[last_prp]);
- dma_pool_free(dev->prp_page_pool, prp_list, dma_addr);
+ dma_pool_free(nvmeq->descriptor_pools.large, prp_list,
+ dma_addr);
dma_addr = next_dma_addr;
}
}
-static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
+static void nvme_unmap_data(struct nvme_dev *dev, struct nvme_queue *nvmeq,
+ struct request *req)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
@@ -566,15 +651,7 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
WARN_ON_ONCE(!iod->sgt.nents);
dma_unmap_sgtable(dev->dev, &iod->sgt, rq_dma_dir(req), 0);
-
- if (iod->nr_allocations == 0)
- dma_pool_free(dev->prp_small_pool, iod->list[0].sg_list,
- iod->first_dma);
- else if (iod->nr_allocations == 1)
- dma_pool_free(dev->prp_page_pool, iod->list[0].sg_list,
- iod->first_dma);
- else
- nvme_free_prps(dev, req);
+ nvme_free_descriptors(nvmeq, req);
mempool_free(iod->sgt.sgl, dev->iod_mempool);
}
@@ -592,11 +669,10 @@ static void nvme_print_sgl(struct scatterlist *sgl, int nents)
}
}
-static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
+static blk_status_t nvme_pci_setup_prps(struct nvme_queue *nvmeq,
struct request *req, struct nvme_rw_command *cmnd)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- struct dma_pool *pool;
int length = blk_rq_payload_bytes(req);
struct scatterlist *sg = iod->sgt.sgl;
int dma_len = sg_dma_len(sg);
@@ -604,7 +680,7 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
int offset = dma_addr & (NVME_CTRL_PAGE_SIZE - 1);
__le64 *prp_list;
dma_addr_t prp_dma;
- int nprps, i;
+ int i;
length -= (NVME_CTRL_PAGE_SIZE - offset);
if (length <= 0) {
@@ -626,30 +702,26 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
goto done;
}
- nprps = DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE);
- if (nprps <= (256 / 8)) {
- pool = dev->prp_small_pool;
- iod->nr_allocations = 0;
- } else {
- pool = dev->prp_page_pool;
- iod->nr_allocations = 1;
- }
+ if (DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE) <=
+ NVME_SMALL_POOL_SIZE / sizeof(__le64))
+ iod->flags |= IOD_SMALL_DESCRIPTOR;
- prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
- if (!prp_list) {
- iod->nr_allocations = -1;
+ prp_list = dma_pool_alloc(nvme_dma_pool(nvmeq, iod), GFP_ATOMIC,
+ &prp_dma);
+ if (!prp_list)
return BLK_STS_RESOURCE;
- }
- iod->list[0].prp_list = prp_list;
+ iod->descriptors[iod->nr_descriptors++] = prp_list;
iod->first_dma = prp_dma;
i = 0;
for (;;) {
if (i == NVME_CTRL_PAGE_SIZE >> 3) {
__le64 *old_prp_list = prp_list;
- prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
+
+ prp_list = dma_pool_alloc(nvmeq->descriptor_pools.large,
+ GFP_ATOMIC, &prp_dma);
if (!prp_list)
goto free_prps;
- iod->list[iod->nr_allocations++].prp_list = prp_list;
+ iod->descriptors[iod->nr_descriptors++] = prp_list;
prp_list[0] = old_prp_list[i - 1];
old_prp_list[i - 1] = cpu_to_le64(prp_dma);
i = 1;
@@ -673,7 +745,7 @@ done:
cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma);
return BLK_STS_OK;
free_prps:
- nvme_free_prps(dev, req);
+ nvme_free_descriptors(nvmeq, req);
return BLK_STS_RESOURCE;
bad_sgl:
WARN(DO_ONCE(nvme_print_sgl, iod->sgt.sgl, iod->sgt.nents),
@@ -698,11 +770,10 @@ static void nvme_pci_sgl_set_seg(struct nvme_sgl_desc *sge,
sge->type = NVME_SGL_FMT_LAST_SEG_DESC << 4;
}
-static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev,
+static blk_status_t nvme_pci_setup_sgls(struct nvme_queue *nvmeq,
struct request *req, struct nvme_rw_command *cmd)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- struct dma_pool *pool;
struct nvme_sgl_desc *sg_list;
struct scatterlist *sg = iod->sgt.sgl;
unsigned int entries = iod->sgt.nents;
@@ -717,21 +788,14 @@ static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev,
return BLK_STS_OK;
}
- if (entries <= (256 / sizeof(struct nvme_sgl_desc))) {
- pool = dev->prp_small_pool;
- iod->nr_allocations = 0;
- } else {
- pool = dev->prp_page_pool;
- iod->nr_allocations = 1;
- }
+ if (entries <= NVME_SMALL_POOL_SIZE / sizeof(*sg_list))
+ iod->flags |= IOD_SMALL_DESCRIPTOR;
- sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma);
- if (!sg_list) {
- iod->nr_allocations = -1;
+ sg_list = dma_pool_alloc(nvme_dma_pool(nvmeq, iod), GFP_ATOMIC,
+ &sgl_dma);
+ if (!sg_list)
return BLK_STS_RESOURCE;
- }
-
- iod->list[0].sg_list = sg_list;
+ iod->descriptors[iod->nr_descriptors++] = sg_list;
iod->first_dma = sgl_dma;
nvme_pci_sgl_set_seg(&cmd->dptr.sgl, sgl_dma, entries);
@@ -785,12 +849,12 @@ static blk_status_t nvme_setup_sgl_simple(struct nvme_dev *dev,
static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
struct nvme_command *cmnd)
{
+ struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
blk_status_t ret = BLK_STS_RESOURCE;
int rc;
if (blk_rq_nr_phys_segments(req) == 1) {
- struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
struct bio_vec bv = req_bvec(req);
if (!is_pci_p2pdma_page(bv.bv_page)) {
@@ -825,9 +889,9 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
}
if (nvme_pci_use_sgls(dev, req, iod->sgt.nents))
- ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw);
+ ret = nvme_pci_setup_sgls(nvmeq, req, &cmnd->rw);
else
- ret = nvme_pci_setup_prps(dev, req, &cmnd->rw);
+ ret = nvme_pci_setup_prps(nvmeq, req, &cmnd->rw);
if (ret != BLK_STS_OK)
goto out_unmap_sg;
return BLK_STS_OK;
@@ -842,6 +906,7 @@ out_free_sg:
static blk_status_t nvme_pci_setup_meta_sgls(struct nvme_dev *dev,
struct request *req)
{
+ struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
struct nvme_rw_command *cmnd = &iod->cmd.rw;
struct nvme_sgl_desc *sg_list;
@@ -865,12 +930,13 @@ static blk_status_t nvme_pci_setup_meta_sgls(struct nvme_dev *dev,
if (rc)
goto out_free_sg;
- sg_list = dma_pool_alloc(dev->prp_small_pool, GFP_ATOMIC, &sgl_dma);
+ sg_list = dma_pool_alloc(nvmeq->descriptor_pools.small, GFP_ATOMIC,
+ &sgl_dma);
if (!sg_list)
goto out_unmap_sg;
entries = iod->meta_sgt.nents;
- iod->meta_list.sg_list = sg_list;
+ iod->meta_descriptor = sg_list;
iod->meta_dma = sgl_dma;
cmnd->flags = NVME_CMD_SGL_METASEG;
@@ -912,7 +978,10 @@ static blk_status_t nvme_pci_setup_meta_mptr(struct nvme_dev *dev,
static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req)
{
- if (nvme_pci_metadata_use_sgls(dev, req))
+ struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+
+ if ((iod->cmd.common.flags & NVME_CMD_SGL_METABUF) &&
+ nvme_pci_metadata_use_sgls(dev, req))
return nvme_pci_setup_meta_sgls(dev, req);
return nvme_pci_setup_meta_mptr(dev, req);
}
@@ -922,8 +991,8 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req)
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
blk_status_t ret;
- iod->aborted = false;
- iod->nr_allocations = -1;
+ iod->flags = 0;
+ iod->nr_descriptors = 0;
iod->sgt.nents = 0;
iod->meta_sgt.nents = 0;
@@ -947,7 +1016,7 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req)
return BLK_STS_OK;
out_unmap_data:
if (blk_rq_nr_phys_segments(req))
- nvme_unmap_data(dev, req);
+ nvme_unmap_data(dev, req->mq_hctx->driver_data, req);
out_free_cmd:
nvme_cleanup_cmd(req);
return ret;
@@ -1037,6 +1106,7 @@ static void nvme_queue_rqs(struct rq_list *rqlist)
}
static __always_inline void nvme_unmap_metadata(struct nvme_dev *dev,
+ struct nvme_queue *nvmeq,
struct request *req)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
@@ -1048,8 +1118,8 @@ static __always_inline void nvme_unmap_metadata(struct nvme_dev *dev,
return;
}
- dma_pool_free(dev->prp_small_pool, iod->meta_list.sg_list,
- iod->meta_dma);
+ dma_pool_free(nvmeq->descriptor_pools.small, iod->meta_descriptor,
+ iod->meta_dma);
dma_unmap_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req), 0);
mempool_free(iod->meta_sgt.sgl, dev->iod_meta_mempool);
}
@@ -1060,10 +1130,10 @@ static __always_inline void nvme_pci_unmap_rq(struct request *req)
struct nvme_dev *dev = nvmeq->dev;
if (blk_integrity_rq(req))
- nvme_unmap_metadata(dev, req);
+ nvme_unmap_metadata(dev, nvmeq, req);
if (blk_rq_nr_phys_segments(req))
- nvme_unmap_data(dev, req);
+ nvme_unmap_data(dev, nvmeq, req);
}
static void nvme_pci_complete_rq(struct request *req)
@@ -1202,7 +1272,9 @@ static void nvme_poll_irqdisable(struct nvme_queue *nvmeq)
WARN_ON_ONCE(test_bit(NVMEQ_POLLED, &nvmeq->flags));
disable_irq(pci_irq_vector(pdev, nvmeq->cq_vector));
+ spin_lock(&nvmeq->cq_poll_lock);
nvme_poll_cq(nvmeq, NULL);
+ spin_unlock(&nvmeq->cq_poll_lock);
enable_irq(pci_irq_vector(pdev, nvmeq->cq_vector));
}
@@ -1488,7 +1560,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req)
* returned to the driver, or if this is the admin queue.
*/
opcode = nvme_req(req)->cmd->common.opcode;
- if (!nvmeq->qid || iod->aborted) {
+ if (!nvmeq->qid || (iod->flags & IOD_ABORTED)) {
dev_warn(dev->ctrl.device,
"I/O tag %d (%04x) opcode %#x (%s) QID %d timeout, reset controller\n",
req->tag, nvme_cid(req), opcode,
@@ -1501,7 +1573,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req)
atomic_inc(&dev->ctrl.abort_limit);
return BLK_EH_RESET_TIMER;
}
- iod->aborted = true;
+ iod->flags |= IOD_ABORTED;
cmd.abort.opcode = nvme_admin_abort_cmd;
cmd.abort.cid = nvme_cid(req);
@@ -2840,35 +2912,6 @@ static int nvme_disable_prepare_reset(struct nvme_dev *dev, bool shutdown)
return 0;
}
-static int nvme_setup_prp_pools(struct nvme_dev *dev)
-{
- size_t small_align = 256;
-
- dev->prp_page_pool = dma_pool_create("prp list page", dev->dev,
- NVME_CTRL_PAGE_SIZE,
- NVME_CTRL_PAGE_SIZE, 0);
- if (!dev->prp_page_pool)
- return -ENOMEM;
-
- if (dev->ctrl.quirks & NVME_QUIRK_DMAPOOL_ALIGN_512)
- small_align = 512;
-
- /* Optimisation for I/Os between 4k and 128k */
- dev->prp_small_pool = dma_pool_create("prp list 256", dev->dev,
- 256, small_align, 0);
- if (!dev->prp_small_pool) {
- dma_pool_destroy(dev->prp_page_pool);
- return -ENOMEM;
- }
- return 0;
-}
-
-static void nvme_release_prp_pools(struct nvme_dev *dev)
-{
- dma_pool_destroy(dev->prp_page_pool);
- dma_pool_destroy(dev->prp_small_pool);
-}
-
static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev)
{
size_t meta_size = sizeof(struct scatterlist) * (NVME_MAX_META_SEGS + 1);
@@ -3183,7 +3226,8 @@ static struct nvme_dev *nvme_pci_alloc_dev(struct pci_dev *pdev,
struct nvme_dev *dev;
int ret = -ENOMEM;
- dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node);
+ dev = kzalloc_node(struct_size(dev, descriptor_pools, nr_node_ids),
+ GFP_KERNEL, node);
if (!dev)
return ERR_PTR(-ENOMEM);
INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work);
@@ -3258,13 +3302,9 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
if (result)
goto out_uninit_ctrl;
- result = nvme_setup_prp_pools(dev);
- if (result)
- goto out_dev_unmap;
-
result = nvme_pci_alloc_iod_mempool(dev);
if (result)
- goto out_release_prp_pools;
+ goto out_dev_unmap;
dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));
@@ -3340,8 +3380,6 @@ out_disable:
out_release_iod_mempool:
mempool_destroy(dev->iod_mempool);
mempool_destroy(dev->iod_meta_mempool);
-out_release_prp_pools:
- nvme_release_prp_pools(dev);
out_dev_unmap:
nvme_dev_unmap(dev);
out_uninit_ctrl:
@@ -3406,7 +3444,7 @@ static void nvme_remove(struct pci_dev *pdev)
nvme_free_queues(dev, 0);
mempool_destroy(dev->iod_mempool);
mempool_destroy(dev->iod_meta_mempool);
- nvme_release_prp_pools(dev);
+ nvme_release_descriptor_pools(dev);
nvme_dev_unmap(dev);
nvme_uninit_ctrl(&dev->ctrl);
}
@@ -3737,6 +3775,8 @@ static const struct pci_device_id nvme_id_table[] = {
.driver_data = NVME_QUIRK_NO_DEEPEST_PS, },
{ PCI_DEVICE(0x1e49, 0x0041), /* ZHITAI TiPro7000 NVMe SSD */
.driver_data = NVME_QUIRK_NO_DEEPEST_PS, },
+ { PCI_DEVICE(0x025e, 0xf1ac), /* SOLIDIGM P44 pro SSDPFKKW020X7 */
+ .driver_data = NVME_QUIRK_NO_DEEPEST_PS, },
{ PCI_DEVICE(0xc0a9, 0x540a), /* Crucial P2 */
.driver_data = NVME_QUIRK_BOGUS_NID, },
{ PCI_DEVICE(0x1d97, 0x2263), /* Lexar NM610 */
@@ -3805,9 +3845,7 @@ static int __init nvme_init(void)
BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64);
BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64);
BUILD_BUG_ON(IRQ_AFFINITY_MAX_SETS < 2);
- BUILD_BUG_ON(NVME_MAX_SEGS > SGES_PER_PAGE);
- BUILD_BUG_ON(sizeof(struct scatterlist) * NVME_MAX_SEGS > PAGE_SIZE);
- BUILD_BUG_ON(nvme_pci_npages_prp() > NVME_MAX_NR_ALLOCATIONS);
+ BUILD_BUG_ON(nvme_pci_npages_prp() > NVME_MAX_NR_DESCRIPTORS);
return pci_register_driver(&nvme_driver);
}
diff --git a/drivers/nvme/host/sysfs.c b/drivers/nvme/host/sysfs.c
index 6d31226f7a4f..29430949ce2f 100644
--- a/drivers/nvme/host/sysfs.c
+++ b/drivers/nvme/host/sysfs.c
@@ -260,6 +260,7 @@ static struct attribute *nvme_ns_attrs[] = {
&dev_attr_ana_state.attr,
&dev_attr_queue_depth.attr,
&dev_attr_numa_nodes.attr,
+ &dev_attr_delayed_removal_secs.attr,
#endif
&dev_attr_io_passthru_err_log_enabled.attr,
NULL,
@@ -296,6 +297,12 @@ static umode_t nvme_ns_attrs_are_visible(struct kobject *kobj,
if (nvme_disk_is_ns_head(dev_to_disk(dev)))
return 0;
}
+ if (a == &dev_attr_delayed_removal_secs.attr) {
+ struct gendisk *disk = dev_to_disk(dev);
+
+ if (!nvme_disk_is_ns_head(disk))
+ return 0;
+ }
#endif
return a->mode;
}
@@ -306,13 +313,41 @@ static const struct attribute_group nvme_ns_attr_group = {
};
#ifdef CONFIG_NVME_MULTIPATH
+/*
+ * NOTE: The dummy attribute does not appear in sysfs. It exists solely to allow
+ * control over the visibility of the multipath sysfs node. Without at least one
+ * attribute defined in nvme_ns_mpath_attrs[], the sysfs implementation does not
+ * invoke the multipath_sysfs_group_visible() method. As a result, we would not
+ * be able to control the visibility of the multipath sysfs node.
+ */
+static struct attribute dummy_attr = {
+ .name = "dummy",
+};
+
static struct attribute *nvme_ns_mpath_attrs[] = {
+ &dummy_attr,
NULL,
};
+static bool multipath_sysfs_group_visible(struct kobject *kobj)
+{
+ struct device *dev = container_of(kobj, struct device, kobj);
+
+ return nvme_disk_is_ns_head(dev_to_disk(dev));
+}
+
+static bool multipath_sysfs_attr_visible(struct kobject *kobj,
+ struct attribute *attr, int n)
+{
+ return false;
+}
+
+DEFINE_SYSFS_GROUP_VISIBLE(multipath_sysfs)
+
const struct attribute_group nvme_ns_mpath_attr_group = {
.name = "multipath",
.attrs = nvme_ns_mpath_attrs,
+ .is_visible = SYSFS_GROUP_VISIBLE(multipath_sysfs),
};
#endif
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index aba365f97cf6..853bc67d045c 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -403,7 +403,7 @@ static inline bool nvme_tcp_queue_more(struct nvme_tcp_queue *queue)
}
static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req,
- bool sync, bool last)
+ bool last)
{
struct nvme_tcp_queue *queue = req->queue;
bool empty;
@@ -417,7 +417,7 @@ static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req,
* are on the same cpu, so we don't introduce contention.
*/
if (queue->io_cpu == raw_smp_processor_id() &&
- sync && empty && mutex_trylock(&queue->send_mutex)) {
+ empty && mutex_trylock(&queue->send_mutex)) {
nvme_tcp_send_all(queue);
mutex_unlock(&queue->send_mutex);
}
@@ -770,7 +770,9 @@ static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue,
req->ttag = pdu->ttag;
nvme_tcp_setup_h2c_data_pdu(req);
- nvme_tcp_queue_request(req, false, true);
+
+ llist_add(&req->lentry, &queue->req_list);
+ queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
return 0;
}
@@ -2385,7 +2387,7 @@ static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new)
if (ret)
return ret;
- if (ctrl->opts && ctrl->opts->concat && !ctrl->tls_pskid) {
+ if (ctrl->opts->concat && !ctrl->tls_pskid) {
/* See comments for nvme_tcp_key_revoke_needed() */
dev_dbg(ctrl->device, "restart admin queue for secure concatenation\n");
nvme_stop_keep_alive(ctrl);
@@ -2637,7 +2639,7 @@ static void nvme_tcp_submit_async_event(struct nvme_ctrl *arg)
ctrl->async_req.curr_bio = NULL;
ctrl->async_req.data_len = 0;
- nvme_tcp_queue_request(&ctrl->async_req, true, true);
+ nvme_tcp_queue_request(&ctrl->async_req, true);
}
static void nvme_tcp_complete_timed_out(struct request *rq)
@@ -2789,7 +2791,7 @@ static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx,
nvme_start_request(rq);
- nvme_tcp_queue_request(req, true, bd->last);
+ nvme_tcp_queue_request(req, bd->last);
return BLK_STS_OK;
}
diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index acc138bbf8f2..c7317299078d 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -63,14 +63,9 @@ static void nvmet_execute_create_sq(struct nvmet_req *req)
if (status != NVME_SC_SUCCESS)
goto complete;
- /*
- * Note: The NVMe specification allows multiple SQs to use the same CQ.
- * However, the target code does not really support that. So for now,
- * prevent this and fail the command if sqid and cqid are different.
- */
- if (!cqid || cqid != sqid) {
- pr_err("SQ %u: Unsupported CQID %u\n", sqid, cqid);
- status = NVME_SC_CQ_INVALID | NVME_STATUS_DNR;
+ status = nvmet_check_io_cqid(ctrl, cqid, false);
+ if (status != NVME_SC_SUCCESS) {
+ pr_err("SQ %u: Invalid CQID %u\n", sqid, cqid);
goto complete;
}
@@ -79,7 +74,7 @@ static void nvmet_execute_create_sq(struct nvmet_req *req)
goto complete;
}
- status = ctrl->ops->create_sq(ctrl, sqid, sq_flags, qsize, prp1);
+ status = ctrl->ops->create_sq(ctrl, sqid, cqid, sq_flags, qsize, prp1);
complete:
nvmet_req_complete(req, status);
@@ -96,14 +91,15 @@ static void nvmet_execute_delete_cq(struct nvmet_req *req)
goto complete;
}
- if (!cqid) {
- status = NVME_SC_QID_INVALID | NVME_STATUS_DNR;
+ status = nvmet_check_io_cqid(ctrl, cqid, false);
+ if (status != NVME_SC_SUCCESS)
goto complete;
- }
- status = nvmet_check_cqid(ctrl, cqid);
- if (status != NVME_SC_SUCCESS)
+ if (!ctrl->cqs[cqid] || nvmet_cq_in_use(ctrl->cqs[cqid])) {
+ /* Some SQs are still using this CQ */
+ status = NVME_SC_QID_INVALID | NVME_STATUS_DNR;
goto complete;
+ }
status = ctrl->ops->delete_cq(ctrl, cqid);
@@ -127,12 +123,7 @@ static void nvmet_execute_create_cq(struct nvmet_req *req)
goto complete;
}
- if (!cqid) {
- status = NVME_SC_QID_INVALID | NVME_STATUS_DNR;
- goto complete;
- }
-
- status = nvmet_check_cqid(ctrl, cqid);
+ status = nvmet_check_io_cqid(ctrl, cqid, true);
if (status != NVME_SC_SUCCESS)
goto complete;
diff --git a/drivers/nvme/target/auth.c b/drivers/nvme/target/auth.c
index 9429b8218408..b340380f3892 100644
--- a/drivers/nvme/target/auth.c
+++ b/drivers/nvme/target/auth.c
@@ -280,9 +280,12 @@ void nvmet_destroy_auth(struct nvmet_ctrl *ctrl)
bool nvmet_check_auth_status(struct nvmet_req *req)
{
- if (req->sq->ctrl->host_key &&
- !req->sq->authenticated)
- return false;
+ if (req->sq->ctrl->host_key) {
+ if (req->sq->qid > 0)
+ return true;
+ if (!req->sq->authenticated)
+ return false;
+ }
return true;
}
@@ -290,7 +293,7 @@ int nvmet_auth_host_hash(struct nvmet_req *req, u8 *response,
unsigned int shash_len)
{
struct crypto_shash *shash_tfm;
- struct shash_desc *shash;
+ SHASH_DESC_ON_STACK(shash, shash_tfm);
struct nvmet_ctrl *ctrl = req->sq->ctrl;
const char *hash_name;
u8 *challenge = req->sq->dhchap_c1;
@@ -342,19 +345,13 @@ int nvmet_auth_host_hash(struct nvmet_req *req, u8 *response,
req->sq->dhchap_c1,
challenge, shash_len);
if (ret)
- goto out_free_challenge;
+ goto out;
}
pr_debug("ctrl %d qid %d host response seq %u transaction %d\n",
ctrl->cntlid, req->sq->qid, req->sq->dhchap_s1,
req->sq->dhchap_tid);
- shash = kzalloc(sizeof(*shash) + crypto_shash_descsize(shash_tfm),
- GFP_KERNEL);
- if (!shash) {
- ret = -ENOMEM;
- goto out_free_challenge;
- }
shash->tfm = shash_tfm;
ret = crypto_shash_init(shash);
if (ret)
@@ -389,8 +386,6 @@ int nvmet_auth_host_hash(struct nvmet_req *req, u8 *response,
goto out;
ret = crypto_shash_final(shash, response);
out:
- kfree(shash);
-out_free_challenge:
if (challenge != req->sq->dhchap_c1)
kfree(challenge);
out_free_response:
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index 245475c43127..db7b17d1094e 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -813,11 +813,43 @@ void nvmet_req_complete(struct nvmet_req *req, u16 status)
}
EXPORT_SYMBOL_GPL(nvmet_req_complete);
+void nvmet_cq_init(struct nvmet_cq *cq)
+{
+ refcount_set(&cq->ref, 1);
+}
+EXPORT_SYMBOL_GPL(nvmet_cq_init);
+
+bool nvmet_cq_get(struct nvmet_cq *cq)
+{
+ return refcount_inc_not_zero(&cq->ref);
+}
+EXPORT_SYMBOL_GPL(nvmet_cq_get);
+
+void nvmet_cq_put(struct nvmet_cq *cq)
+{
+ if (refcount_dec_and_test(&cq->ref))
+ nvmet_cq_destroy(cq);
+}
+EXPORT_SYMBOL_GPL(nvmet_cq_put);
+
void nvmet_cq_setup(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq,
u16 qid, u16 size)
{
cq->qid = qid;
cq->size = size;
+
+ ctrl->cqs[qid] = cq;
+}
+
+void nvmet_cq_destroy(struct nvmet_cq *cq)
+{
+ struct nvmet_ctrl *ctrl = cq->ctrl;
+
+ if (ctrl) {
+ ctrl->cqs[cq->qid] = NULL;
+ nvmet_ctrl_put(cq->ctrl);
+ cq->ctrl = NULL;
+ }
}
void nvmet_sq_setup(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq,
@@ -837,37 +869,47 @@ static void nvmet_confirm_sq(struct percpu_ref *ref)
complete(&sq->confirm_done);
}
-u16 nvmet_check_cqid(struct nvmet_ctrl *ctrl, u16 cqid)
+u16 nvmet_check_cqid(struct nvmet_ctrl *ctrl, u16 cqid, bool create)
{
- if (!ctrl->sqs)
+ if (!ctrl->cqs)
return NVME_SC_INTERNAL | NVME_STATUS_DNR;
if (cqid > ctrl->subsys->max_qid)
return NVME_SC_QID_INVALID | NVME_STATUS_DNR;
- /*
- * Note: For PCI controllers, the NVMe specifications allows multiple
- * SQs to share a single CQ. However, we do not support this yet, so
- * check that there is no SQ defined for a CQ. If one exist, then the
- * CQ ID is invalid for creation as well as when the CQ is being
- * deleted (as that would mean that the SQ was not deleted before the
- * CQ).
- */
- if (ctrl->sqs[cqid])
+ if ((create && ctrl->cqs[cqid]) || (!create && !ctrl->cqs[cqid]))
return NVME_SC_QID_INVALID | NVME_STATUS_DNR;
return NVME_SC_SUCCESS;
}
+u16 nvmet_check_io_cqid(struct nvmet_ctrl *ctrl, u16 cqid, bool create)
+{
+ if (!cqid)
+ return NVME_SC_QID_INVALID | NVME_STATUS_DNR;
+ return nvmet_check_cqid(ctrl, cqid, create);
+}
+
+bool nvmet_cq_in_use(struct nvmet_cq *cq)
+{
+ return refcount_read(&cq->ref) > 1;
+}
+EXPORT_SYMBOL_GPL(nvmet_cq_in_use);
+
u16 nvmet_cq_create(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq,
u16 qid, u16 size)
{
u16 status;
- status = nvmet_check_cqid(ctrl, qid);
+ status = nvmet_check_cqid(ctrl, qid, true);
if (status != NVME_SC_SUCCESS)
return status;
+ if (!kref_get_unless_zero(&ctrl->ref))
+ return NVME_SC_INTERNAL | NVME_STATUS_DNR;
+ cq->ctrl = ctrl;
+
+ nvmet_cq_init(cq);
nvmet_cq_setup(ctrl, cq, qid, size);
return NVME_SC_SUCCESS;
@@ -891,7 +933,7 @@ u16 nvmet_check_sqid(struct nvmet_ctrl *ctrl, u16 sqid,
}
u16 nvmet_sq_create(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq,
- u16 sqid, u16 size)
+ struct nvmet_cq *cq, u16 sqid, u16 size)
{
u16 status;
int ret;
@@ -903,7 +945,7 @@ u16 nvmet_sq_create(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq,
if (status != NVME_SC_SUCCESS)
return status;
- ret = nvmet_sq_init(sq);
+ ret = nvmet_sq_init(sq, cq);
if (ret) {
status = NVME_SC_INTERNAL | NVME_STATUS_DNR;
goto ctrl_put;
@@ -935,6 +977,7 @@ void nvmet_sq_destroy(struct nvmet_sq *sq)
wait_for_completion(&sq->free_done);
percpu_ref_exit(&sq->ref);
nvmet_auth_sq_free(sq);
+ nvmet_cq_put(sq->cq);
/*
* we must reference the ctrl again after waiting for inflight IO
@@ -967,18 +1010,23 @@ static void nvmet_sq_free(struct percpu_ref *ref)
complete(&sq->free_done);
}
-int nvmet_sq_init(struct nvmet_sq *sq)
+int nvmet_sq_init(struct nvmet_sq *sq, struct nvmet_cq *cq)
{
int ret;
+ if (!nvmet_cq_get(cq))
+ return -EINVAL;
+
ret = percpu_ref_init(&sq->ref, nvmet_sq_free, 0, GFP_KERNEL);
if (ret) {
pr_err("percpu_ref init failed!\n");
+ nvmet_cq_put(cq);
return ret;
}
init_completion(&sq->free_done);
init_completion(&sq->confirm_done);
nvmet_auth_sq_init(sq);
+ sq->cq = cq;
return 0;
}
@@ -1108,13 +1156,13 @@ static u16 nvmet_parse_io_cmd(struct nvmet_req *req)
return ret;
}
-bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq,
- struct nvmet_sq *sq, const struct nvmet_fabrics_ops *ops)
+bool nvmet_req_init(struct nvmet_req *req, struct nvmet_sq *sq,
+ const struct nvmet_fabrics_ops *ops)
{
u8 flags = req->cmd->common.flags;
u16 status;
- req->cq = cq;
+ req->cq = sq->cq;
req->sq = sq;
req->ops = ops;
req->sg = NULL;
@@ -1612,12 +1660,17 @@ struct nvmet_ctrl *nvmet_alloc_ctrl(struct nvmet_alloc_ctrl_args *args)
if (!ctrl->sqs)
goto out_free_changed_ns_list;
+ ctrl->cqs = kcalloc(subsys->max_qid + 1, sizeof(struct nvmet_cq *),
+ GFP_KERNEL);
+ if (!ctrl->cqs)
+ goto out_free_sqs;
+
ret = ida_alloc_range(&cntlid_ida,
subsys->cntlid_min, subsys->cntlid_max,
GFP_KERNEL);
if (ret < 0) {
args->status = NVME_SC_CONNECT_CTRL_BUSY | NVME_STATUS_DNR;
- goto out_free_sqs;
+ goto out_free_cqs;
}
ctrl->cntlid = ret;
@@ -1676,6 +1729,8 @@ init_pr_fail:
mutex_unlock(&subsys->lock);
nvmet_stop_keep_alive_timer(ctrl);
ida_free(&cntlid_ida, ctrl->cntlid);
+out_free_cqs:
+ kfree(ctrl->cqs);
out_free_sqs:
kfree(ctrl->sqs);
out_free_changed_ns_list:
@@ -1712,6 +1767,7 @@ static void nvmet_ctrl_free(struct kref *ref)
nvmet_async_events_free(ctrl);
kfree(ctrl->sqs);
+ kfree(ctrl->cqs);
kfree(ctrl->changed_ns_list);
kfree(ctrl);
diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c
index df7207640506..c06f3e04296c 100644
--- a/drivers/nvme/target/discovery.c
+++ b/drivers/nvme/target/discovery.c
@@ -119,7 +119,7 @@ static void nvmet_format_discovery_entry(struct nvmf_disc_rsp_page_hdr *hdr,
memcpy(e->trsvcid, port->disc_addr.trsvcid, NVMF_TRSVCID_SIZE);
memcpy(e->traddr, traddr, NVMF_TRADDR_SIZE);
memcpy(e->tsas.common, port->disc_addr.tsas.common, NVMF_TSAS_SIZE);
- strncpy(e->subnqn, subsys_nqn, NVMF_NQN_SIZE);
+ strscpy(e->subnqn, subsys_nqn, NVMF_NQN_SIZE);
}
/*
diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c
index f012bdf89850..7b8d8b397802 100644
--- a/drivers/nvme/target/fabrics-cmd.c
+++ b/drivers/nvme/target/fabrics-cmd.c
@@ -208,6 +208,14 @@ static u16 nvmet_install_queue(struct nvmet_ctrl *ctrl, struct nvmet_req *req)
return NVME_SC_CONNECT_CTRL_BUSY | NVME_STATUS_DNR;
}
+ kref_get(&ctrl->ref);
+ old = cmpxchg(&req->cq->ctrl, NULL, ctrl);
+ if (old) {
+ pr_warn("queue already connected!\n");
+ req->error_loc = offsetof(struct nvmf_connect_command, opcode);
+ return NVME_SC_CONNECT_CTRL_BUSY | NVME_STATUS_DNR;
+ }
+
/* note: convert queue size from 0's-based value to 1's-based value */
nvmet_cq_setup(ctrl, req->cq, qid, sqsize + 1);
nvmet_sq_setup(ctrl, req->sq, qid, sqsize + 1);
@@ -239,8 +247,8 @@ static u32 nvmet_connect_result(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq)
bool needs_auth = nvmet_has_auth(ctrl, sq);
key_serial_t keyid = nvmet_queue_tls_keyid(sq);
- /* Do not authenticate I/O queues for secure concatenation */
- if (ctrl->concat && sq->qid)
+ /* Do not authenticate I/O queues */
+ if (sq->qid)
needs_auth = false;
if (keyid)
diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c
index 7b50130f10f6..254537b93e63 100644
--- a/drivers/nvme/target/fc.c
+++ b/drivers/nvme/target/fc.c
@@ -816,7 +816,8 @@ nvmet_fc_alloc_target_queue(struct nvmet_fc_tgt_assoc *assoc,
nvmet_fc_prep_fcp_iodlist(assoc->tgtport, queue);
- ret = nvmet_sq_init(&queue->nvme_sq);
+ nvmet_cq_init(&queue->nvme_cq);
+ ret = nvmet_sq_init(&queue->nvme_sq, &queue->nvme_cq);
if (ret)
goto out_fail_iodlist;
@@ -826,6 +827,7 @@ nvmet_fc_alloc_target_queue(struct nvmet_fc_tgt_assoc *assoc,
return queue;
out_fail_iodlist:
+ nvmet_cq_put(&queue->nvme_cq);
nvmet_fc_destroy_fcp_iodlist(assoc->tgtport, queue);
destroy_workqueue(queue->work_q);
out_free_queue:
@@ -934,6 +936,7 @@ nvmet_fc_delete_target_queue(struct nvmet_fc_tgt_queue *queue)
flush_workqueue(queue->work_q);
nvmet_sq_destroy(&queue->nvme_sq);
+ nvmet_cq_put(&queue->nvme_cq);
nvmet_fc_tgt_q_put(queue);
}
@@ -1254,6 +1257,7 @@ nvmet_fc_portentry_bind(struct nvmet_fc_tgtport *tgtport,
{
lockdep_assert_held(&nvmet_fc_tgtlock);
+ nvmet_fc_tgtport_get(tgtport);
pe->tgtport = tgtport;
tgtport->pe = pe;
@@ -1273,8 +1277,10 @@ nvmet_fc_portentry_unbind(struct nvmet_fc_port_entry *pe)
unsigned long flags;
spin_lock_irqsave(&nvmet_fc_tgtlock, flags);
- if (pe->tgtport)
+ if (pe->tgtport) {
+ nvmet_fc_tgtport_put(pe->tgtport);
pe->tgtport->pe = NULL;
+ }
list_del(&pe->pe_list);
spin_unlock_irqrestore(&nvmet_fc_tgtlock, flags);
}
@@ -1292,8 +1298,10 @@ nvmet_fc_portentry_unbind_tgt(struct nvmet_fc_tgtport *tgtport)
spin_lock_irqsave(&nvmet_fc_tgtlock, flags);
pe = tgtport->pe;
- if (pe)
+ if (pe) {
+ nvmet_fc_tgtport_put(pe->tgtport);
pe->tgtport = NULL;
+ }
tgtport->pe = NULL;
spin_unlock_irqrestore(&nvmet_fc_tgtlock, flags);
}
@@ -1316,6 +1324,9 @@ nvmet_fc_portentry_rebind_tgt(struct nvmet_fc_tgtport *tgtport)
list_for_each_entry(pe, &nvmet_fc_portentry_list, pe_list) {
if (tgtport->fc_target_port.node_name == pe->node_name &&
tgtport->fc_target_port.port_name == pe->port_name) {
+ if (!nvmet_fc_tgtport_get(tgtport))
+ continue;
+
WARN_ON(pe->tgtport);
tgtport->pe = pe;
pe->tgtport = tgtport;
@@ -1580,6 +1591,39 @@ nvmet_fc_delete_ctrl(struct nvmet_ctrl *ctrl)
spin_unlock_irqrestore(&nvmet_fc_tgtlock, flags);
}
+static void
+nvmet_fc_free_pending_reqs(struct nvmet_fc_tgtport *tgtport)
+{
+ struct nvmet_fc_ls_req_op *lsop;
+ struct nvmefc_ls_req *lsreq;
+ struct nvmet_fc_ls_iod *iod;
+ int i;
+
+ iod = tgtport->iod;
+ for (i = 0; i < NVMET_LS_CTX_COUNT; iod++, i++)
+ cancel_work(&iod->work);
+
+ /*
+ * After this point the connection is lost and thus any pending
+ * request can't be processed by the normal completion path. This
+ * is likely a request from nvmet_fc_send_ls_req_async.
+ */
+ while ((lsop = list_first_entry_or_null(&tgtport->ls_req_list,
+ struct nvmet_fc_ls_req_op, lsreq_list))) {
+ list_del(&lsop->lsreq_list);
+
+ if (!lsop->req_queued)
+ continue;
+
+ lsreq = &lsop->ls_req;
+ fc_dma_unmap_single(tgtport->dev, lsreq->rqstdma,
+ (lsreq->rqstlen + lsreq->rsplen),
+ DMA_BIDIRECTIONAL);
+ nvmet_fc_tgtport_put(tgtport);
+ kfree(lsop);
+ }
+}
+
/**
* nvmet_fc_unregister_targetport - transport entry point called by an
* LLDD to deregister/remove a previously
@@ -1608,13 +1652,7 @@ nvmet_fc_unregister_targetport(struct nvmet_fc_target_port *target_port)
flush_workqueue(nvmet_wq);
- /*
- * should terminate LS's as well. However, LS's will be generated
- * at the tail end of association termination, so they likely don't
- * exist yet. And even if they did, it's worthwhile to just let
- * them finish and targetport ref counting will clean things up.
- */
-
+ nvmet_fc_free_pending_reqs(tgtport);
nvmet_fc_tgtport_put(tgtport);
return 0;
@@ -2531,10 +2569,8 @@ nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport,
fod->data_sg = NULL;
fod->data_sg_cnt = 0;
- ret = nvmet_req_init(&fod->req,
- &fod->queue->nvme_cq,
- &fod->queue->nvme_sq,
- &nvmet_fc_tgt_fcp_ops);
+ ret = nvmet_req_init(&fod->req, &fod->queue->nvme_sq,
+ &nvmet_fc_tgt_fcp_ops);
if (!ret) {
/* bad SQE content or invalid ctrl state */
/* nvmet layer has already called op done to send rsp. */
@@ -2860,12 +2896,17 @@ nvmet_fc_add_port(struct nvmet_port *port)
list_for_each_entry(tgtport, &nvmet_fc_target_list, tgt_list) {
if ((tgtport->fc_target_port.node_name == traddr.nn) &&
(tgtport->fc_target_port.port_name == traddr.pn)) {
+ if (!nvmet_fc_tgtport_get(tgtport))
+ continue;
+
/* a FC port can only be 1 nvmet port id */
if (!tgtport->pe) {
nvmet_fc_portentry_bind(tgtport, pe, port);
ret = 0;
} else
ret = -EALREADY;
+
+ nvmet_fc_tgtport_put(tgtport);
break;
}
}
@@ -2881,11 +2922,21 @@ static void
nvmet_fc_remove_port(struct nvmet_port *port)
{
struct nvmet_fc_port_entry *pe = port->priv;
+ struct nvmet_fc_tgtport *tgtport = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&nvmet_fc_tgtlock, flags);
+ if (pe->tgtport && nvmet_fc_tgtport_get(pe->tgtport))
+ tgtport = pe->tgtport;
+ spin_unlock_irqrestore(&nvmet_fc_tgtlock, flags);
nvmet_fc_portentry_unbind(pe);
- /* terminate any outstanding associations */
- __nvmet_fc_free_assocs(pe->tgtport);
+ if (tgtport) {
+ /* terminate any outstanding associations */
+ __nvmet_fc_free_assocs(tgtport);
+ nvmet_fc_tgtport_put(tgtport);
+ }
kfree(pe);
}
@@ -2894,10 +2945,21 @@ static void
nvmet_fc_discovery_chg(struct nvmet_port *port)
{
struct nvmet_fc_port_entry *pe = port->priv;
- struct nvmet_fc_tgtport *tgtport = pe->tgtport;
+ struct nvmet_fc_tgtport *tgtport = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&nvmet_fc_tgtlock, flags);
+ if (pe->tgtport && nvmet_fc_tgtport_get(pe->tgtport))
+ tgtport = pe->tgtport;
+ spin_unlock_irqrestore(&nvmet_fc_tgtlock, flags);
+
+ if (!tgtport)
+ return;
if (tgtport && tgtport->ops->discovery_event)
tgtport->ops->discovery_event(&tgtport->fc_target_port);
+
+ nvmet_fc_tgtport_put(tgtport);
}
static ssize_t
diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c
index 641201e62c1b..257b497d515a 100644
--- a/drivers/nvme/target/fcloop.c
+++ b/drivers/nvme/target/fcloop.c
@@ -207,7 +207,6 @@ static LIST_HEAD(fcloop_nports);
struct fcloop_lport {
struct nvme_fc_local_port *localport;
struct list_head lport_list;
- struct completion unreg_done;
refcount_t ref;
};
@@ -215,6 +214,9 @@ struct fcloop_lport_priv {
struct fcloop_lport *lport;
};
+/* The port is already being removed, avoid double free */
+#define PORT_DELETED 0
+
struct fcloop_rport {
struct nvme_fc_remote_port *remoteport;
struct nvmet_fc_target_port *targetport;
@@ -223,6 +225,7 @@ struct fcloop_rport {
spinlock_t lock;
struct list_head ls_list;
struct work_struct ls_work;
+ unsigned long flags;
};
struct fcloop_tport {
@@ -233,6 +236,7 @@ struct fcloop_tport {
spinlock_t lock;
struct list_head ls_list;
struct work_struct ls_work;
+ unsigned long flags;
};
struct fcloop_nport {
@@ -288,6 +292,9 @@ struct fcloop_ini_fcpreq {
spinlock_t inilock;
};
+/* SLAB cache for fcloop_lsreq structures */
+static struct kmem_cache *lsreq_cache;
+
static inline struct fcloop_lsreq *
ls_rsp_to_lsreq(struct nvmefc_ls_rsp *lsrsp)
{
@@ -338,6 +345,7 @@ fcloop_rport_lsrqst_work(struct work_struct *work)
* callee may free memory containing tls_req.
* do not reference lsreq after this.
*/
+ kmem_cache_free(lsreq_cache, tls_req);
spin_lock(&rport->lock);
}
@@ -349,10 +357,13 @@ fcloop_h2t_ls_req(struct nvme_fc_local_port *localport,
struct nvme_fc_remote_port *remoteport,
struct nvmefc_ls_req *lsreq)
{
- struct fcloop_lsreq *tls_req = lsreq->private;
struct fcloop_rport *rport = remoteport->private;
+ struct fcloop_lsreq *tls_req;
int ret = 0;
+ tls_req = kmem_cache_alloc(lsreq_cache, GFP_KERNEL);
+ if (!tls_req)
+ return -ENOMEM;
tls_req->lsreq = lsreq;
INIT_LIST_HEAD(&tls_req->ls_list);
@@ -389,14 +400,17 @@ fcloop_h2t_xmt_ls_rsp(struct nvmet_fc_target_port *targetport,
lsrsp->done(lsrsp);
- if (remoteport) {
- rport = remoteport->private;
- spin_lock(&rport->lock);
- list_add_tail(&tls_req->ls_list, &rport->ls_list);
- spin_unlock(&rport->lock);
- queue_work(nvmet_wq, &rport->ls_work);
+ if (!remoteport) {
+ kmem_cache_free(lsreq_cache, tls_req);
+ return 0;
}
+ rport = remoteport->private;
+ spin_lock(&rport->lock);
+ list_add_tail(&tls_req->ls_list, &rport->ls_list);
+ spin_unlock(&rport->lock);
+ queue_work(nvmet_wq, &rport->ls_work);
+
return 0;
}
@@ -422,6 +436,7 @@ fcloop_tport_lsrqst_work(struct work_struct *work)
* callee may free memory containing tls_req.
* do not reference lsreq after this.
*/
+ kmem_cache_free(lsreq_cache, tls_req);
spin_lock(&tport->lock);
}
@@ -432,8 +447,8 @@ static int
fcloop_t2h_ls_req(struct nvmet_fc_target_port *targetport, void *hosthandle,
struct nvmefc_ls_req *lsreq)
{
- struct fcloop_lsreq *tls_req = lsreq->private;
struct fcloop_tport *tport = targetport->private;
+ struct fcloop_lsreq *tls_req;
int ret = 0;
/*
@@ -441,6 +456,10 @@ fcloop_t2h_ls_req(struct nvmet_fc_target_port *targetport, void *hosthandle,
* hosthandle ignored as fcloop currently is
* 1:1 tgtport vs remoteport
*/
+
+ tls_req = kmem_cache_alloc(lsreq_cache, GFP_KERNEL);
+ if (!tls_req)
+ return -ENOMEM;
tls_req->lsreq = lsreq;
INIT_LIST_HEAD(&tls_req->ls_list);
@@ -457,6 +476,9 @@ fcloop_t2h_ls_req(struct nvmet_fc_target_port *targetport, void *hosthandle,
ret = nvme_fc_rcv_ls_req(tport->remoteport, &tls_req->ls_rsp,
lsreq->rqstaddr, lsreq->rqstlen);
+ if (ret)
+ kmem_cache_free(lsreq_cache, tls_req);
+
return ret;
}
@@ -471,18 +493,30 @@ fcloop_t2h_xmt_ls_rsp(struct nvme_fc_local_port *localport,
struct nvmet_fc_target_port *targetport = rport->targetport;
struct fcloop_tport *tport;
+ if (!targetport) {
+ /*
+ * The target port is gone. The target doesn't expect any
+ * response anymore and the ->done call is not valid
+ * because the resources have been freed by
+ * nvmet_fc_free_pending_reqs.
+ *
+ * We end up here from delete association exchange:
+ * nvmet_fc_xmt_disconnect_assoc sends an async request.
+ */
+ kmem_cache_free(lsreq_cache, tls_req);
+ return 0;
+ }
+
memcpy(lsreq->rspaddr, lsrsp->rspbuf,
((lsreq->rsplen < lsrsp->rsplen) ?
lsreq->rsplen : lsrsp->rsplen));
lsrsp->done(lsrsp);
- if (targetport) {
- tport = targetport->private;
- spin_lock(&tport->lock);
- list_add_tail(&tls_req->ls_list, &tport->ls_list);
- spin_unlock(&tport->lock);
- queue_work(nvmet_wq, &tport->ls_work);
- }
+ tport = targetport->private;
+ spin_lock(&tport->lock);
+ list_add_tail(&tls_req->ls_list, &tport->ls_list);
+ spin_unlock(&tport->lock);
+ queue_work(nvmet_wq, &tport->ls_work);
return 0;
}
@@ -566,7 +600,8 @@ fcloop_call_host_done(struct nvmefc_fcp_req *fcpreq,
}
/* release original io reference on tgt struct */
- fcloop_tfcp_req_put(tfcp_req);
+ if (tfcp_req)
+ fcloop_tfcp_req_put(tfcp_req);
}
static bool drop_fabric_opcode;
@@ -618,12 +653,13 @@ fcloop_fcp_recv_work(struct work_struct *work)
{
struct fcloop_fcpreq *tfcp_req =
container_of(work, struct fcloop_fcpreq, fcp_rcv_work);
- struct nvmefc_fcp_req *fcpreq = tfcp_req->fcpreq;
+ struct nvmefc_fcp_req *fcpreq;
unsigned long flags;
int ret = 0;
bool aborted = false;
spin_lock_irqsave(&tfcp_req->reqlock, flags);
+ fcpreq = tfcp_req->fcpreq;
switch (tfcp_req->inistate) {
case INI_IO_START:
tfcp_req->inistate = INI_IO_ACTIVE;
@@ -638,16 +674,19 @@ fcloop_fcp_recv_work(struct work_struct *work)
}
spin_unlock_irqrestore(&tfcp_req->reqlock, flags);
- if (unlikely(aborted))
- ret = -ECANCELED;
- else {
- if (likely(!check_for_drop(tfcp_req)))
- ret = nvmet_fc_rcv_fcp_req(tfcp_req->tport->targetport,
- &tfcp_req->tgt_fcp_req,
- fcpreq->cmdaddr, fcpreq->cmdlen);
- else
- pr_info("%s: dropped command ********\n", __func__);
+ if (unlikely(aborted)) {
+ /* the abort handler will call fcloop_call_host_done */
+ return;
+ }
+
+ if (unlikely(check_for_drop(tfcp_req))) {
+ pr_info("%s: dropped command ********\n", __func__);
+ return;
}
+
+ ret = nvmet_fc_rcv_fcp_req(tfcp_req->tport->targetport,
+ &tfcp_req->tgt_fcp_req,
+ fcpreq->cmdaddr, fcpreq->cmdlen);
if (ret)
fcloop_call_host_done(fcpreq, tfcp_req, ret);
}
@@ -662,15 +701,17 @@ fcloop_fcp_abort_recv_work(struct work_struct *work)
unsigned long flags;
spin_lock_irqsave(&tfcp_req->reqlock, flags);
- fcpreq = tfcp_req->fcpreq;
switch (tfcp_req->inistate) {
case INI_IO_ABORTED:
+ fcpreq = tfcp_req->fcpreq;
+ tfcp_req->fcpreq = NULL;
break;
case INI_IO_COMPLETED:
completed = true;
break;
default:
spin_unlock_irqrestore(&tfcp_req->reqlock, flags);
+ fcloop_tfcp_req_put(tfcp_req);
WARN_ON(1);
return;
}
@@ -686,10 +727,6 @@ fcloop_fcp_abort_recv_work(struct work_struct *work)
nvmet_fc_rcv_fcp_abort(tfcp_req->tport->targetport,
&tfcp_req->tgt_fcp_req);
- spin_lock_irqsave(&tfcp_req->reqlock, flags);
- tfcp_req->fcpreq = NULL;
- spin_unlock_irqrestore(&tfcp_req->reqlock, flags);
-
fcloop_call_host_done(fcpreq, tfcp_req, -ECANCELED);
/* call_host_done releases reference for abort downcall */
}
@@ -958,13 +995,16 @@ fcloop_fcp_abort(struct nvme_fc_local_port *localport,
spin_lock(&inireq->inilock);
tfcp_req = inireq->tfcp_req;
- if (tfcp_req)
- fcloop_tfcp_req_get(tfcp_req);
+ if (tfcp_req) {
+ if (!fcloop_tfcp_req_get(tfcp_req))
+ tfcp_req = NULL;
+ }
spin_unlock(&inireq->inilock);
- if (!tfcp_req)
+ if (!tfcp_req) {
/* abort has already been called */
- return;
+ goto out_host_done;
+ }
/* break initiator/target relationship for io */
spin_lock_irqsave(&tfcp_req->reqlock, flags);
@@ -979,7 +1019,7 @@ fcloop_fcp_abort(struct nvme_fc_local_port *localport,
default:
spin_unlock_irqrestore(&tfcp_req->reqlock, flags);
WARN_ON(1);
- return;
+ goto out_host_done;
}
spin_unlock_irqrestore(&tfcp_req->reqlock, flags);
@@ -993,6 +1033,11 @@ fcloop_fcp_abort(struct nvme_fc_local_port *localport,
*/
fcloop_tfcp_req_put(tfcp_req);
}
+
+ return;
+
+out_host_done:
+ fcloop_call_host_done(fcpreq, tfcp_req, -ECANCELED);
}
static void
@@ -1019,9 +1064,18 @@ fcloop_lport_get(struct fcloop_lport *lport)
static void
fcloop_nport_put(struct fcloop_nport *nport)
{
+ unsigned long flags;
+
if (!refcount_dec_and_test(&nport->ref))
return;
+ spin_lock_irqsave(&fcloop_lock, flags);
+ list_del(&nport->nport_list);
+ spin_unlock_irqrestore(&fcloop_lock, flags);
+
+ if (nport->lport)
+ fcloop_lport_put(nport->lport);
+
kfree(nport);
}
@@ -1037,9 +1091,6 @@ fcloop_localport_delete(struct nvme_fc_local_port *localport)
struct fcloop_lport_priv *lport_priv = localport->private;
struct fcloop_lport *lport = lport_priv->lport;
- /* release any threads waiting for the unreg to complete */
- complete(&lport->unreg_done);
-
fcloop_lport_put(lport);
}
@@ -1047,18 +1098,38 @@ static void
fcloop_remoteport_delete(struct nvme_fc_remote_port *remoteport)
{
struct fcloop_rport *rport = remoteport->private;
+ bool put_port = false;
+ unsigned long flags;
flush_work(&rport->ls_work);
- fcloop_nport_put(rport->nport);
+
+ spin_lock_irqsave(&fcloop_lock, flags);
+ if (!test_and_set_bit(PORT_DELETED, &rport->flags))
+ put_port = true;
+ rport->nport->rport = NULL;
+ spin_unlock_irqrestore(&fcloop_lock, flags);
+
+ if (put_port)
+ fcloop_nport_put(rport->nport);
}
static void
fcloop_targetport_delete(struct nvmet_fc_target_port *targetport)
{
struct fcloop_tport *tport = targetport->private;
+ bool put_port = false;
+ unsigned long flags;
flush_work(&tport->ls_work);
- fcloop_nport_put(tport->nport);
+
+ spin_lock_irqsave(&fcloop_lock, flags);
+ if (!test_and_set_bit(PORT_DELETED, &tport->flags))
+ put_port = true;
+ tport->nport->tport = NULL;
+ spin_unlock_irqrestore(&fcloop_lock, flags);
+
+ if (put_port)
+ fcloop_nport_put(tport->nport);
}
#define FCLOOP_HW_QUEUES 4
@@ -1082,7 +1153,6 @@ static struct nvme_fc_port_template fctemplate = {
/* sizes of additional private data for data structures */
.local_priv_sz = sizeof(struct fcloop_lport_priv),
.remote_priv_sz = sizeof(struct fcloop_rport),
- .lsrqst_priv_sz = sizeof(struct fcloop_lsreq),
.fcprqst_priv_sz = sizeof(struct fcloop_ini_fcpreq),
};
@@ -1105,7 +1175,6 @@ static struct nvmet_fc_target_template tgttemplate = {
.target_features = 0,
/* sizes of additional private data for data structures */
.target_priv_sz = sizeof(struct fcloop_tport),
- .lsrqst_priv_sz = sizeof(struct fcloop_lsreq),
};
static ssize_t
@@ -1170,51 +1239,92 @@ out_free_lport:
}
static int
-__wait_localport_unreg(struct fcloop_lport *lport)
+__localport_unreg(struct fcloop_lport *lport)
{
- int ret;
+ return nvme_fc_unregister_localport(lport->localport);
+}
- init_completion(&lport->unreg_done);
+static struct fcloop_nport *
+__fcloop_nport_lookup(u64 node_name, u64 port_name)
+{
+ struct fcloop_nport *nport;
- ret = nvme_fc_unregister_localport(lport->localport);
+ list_for_each_entry(nport, &fcloop_nports, nport_list) {
+ if (nport->node_name != node_name ||
+ nport->port_name != port_name)
+ continue;
- if (!ret)
- wait_for_completion(&lport->unreg_done);
+ if (fcloop_nport_get(nport))
+ return nport;
- return ret;
+ break;
+ }
+
+ return NULL;
}
+static struct fcloop_nport *
+fcloop_nport_lookup(u64 node_name, u64 port_name)
+{
+ struct fcloop_nport *nport;
+ unsigned long flags;
+
+ spin_lock_irqsave(&fcloop_lock, flags);
+ nport = __fcloop_nport_lookup(node_name, port_name);
+ spin_unlock_irqrestore(&fcloop_lock, flags);
+
+ return nport;
+}
+
+static struct fcloop_lport *
+__fcloop_lport_lookup(u64 node_name, u64 port_name)
+{
+ struct fcloop_lport *lport;
+
+ list_for_each_entry(lport, &fcloop_lports, lport_list) {
+ if (lport->localport->node_name != node_name ||
+ lport->localport->port_name != port_name)
+ continue;
+
+ if (fcloop_lport_get(lport))
+ return lport;
+
+ break;
+ }
+
+ return NULL;
+}
+
+static struct fcloop_lport *
+fcloop_lport_lookup(u64 node_name, u64 port_name)
+{
+ struct fcloop_lport *lport;
+ unsigned long flags;
+
+ spin_lock_irqsave(&fcloop_lock, flags);
+ lport = __fcloop_lport_lookup(node_name, port_name);
+ spin_unlock_irqrestore(&fcloop_lock, flags);
+
+ return lport;
+}
static ssize_t
fcloop_delete_local_port(struct device *dev, struct device_attribute *attr,
const char *buf, size_t count)
{
- struct fcloop_lport *tlport, *lport = NULL;
+ struct fcloop_lport *lport;
u64 nodename, portname;
- unsigned long flags;
int ret;
ret = fcloop_parse_nm_options(dev, &nodename, &portname, buf);
if (ret)
return ret;
- spin_lock_irqsave(&fcloop_lock, flags);
-
- list_for_each_entry(tlport, &fcloop_lports, lport_list) {
- if (tlport->localport->node_name == nodename &&
- tlport->localport->port_name == portname) {
- if (!fcloop_lport_get(tlport))
- break;
- lport = tlport;
- break;
- }
- }
- spin_unlock_irqrestore(&fcloop_lock, flags);
-
+ lport = fcloop_lport_lookup(nodename, portname);
if (!lport)
return -ENOENT;
- ret = __wait_localport_unreg(lport);
+ ret = __localport_unreg(lport);
fcloop_lport_put(lport);
return ret ? ret : count;
@@ -1223,8 +1333,8 @@ fcloop_delete_local_port(struct device *dev, struct device_attribute *attr,
static struct fcloop_nport *
fcloop_alloc_nport(const char *buf, size_t count, bool remoteport)
{
- struct fcloop_nport *newnport, *nport = NULL;
- struct fcloop_lport *tmplport, *lport = NULL;
+ struct fcloop_nport *newnport, *nport;
+ struct fcloop_lport *lport;
struct fcloop_ctrl_options *opts;
unsigned long flags;
u32 opts_mask = (remoteport) ? RPORT_OPTS : TGTPORT_OPTS;
@@ -1239,10 +1349,8 @@ fcloop_alloc_nport(const char *buf, size_t count, bool remoteport)
goto out_free_opts;
/* everything there ? */
- if ((opts->mask & opts_mask) != opts_mask) {
- ret = -EINVAL;
+ if ((opts->mask & opts_mask) != opts_mask)
goto out_free_opts;
- }
newnport = kzalloc(sizeof(*newnport), GFP_KERNEL);
if (!newnport)
@@ -1258,60 +1366,61 @@ fcloop_alloc_nport(const char *buf, size_t count, bool remoteport)
refcount_set(&newnport->ref, 1);
spin_lock_irqsave(&fcloop_lock, flags);
-
- list_for_each_entry(tmplport, &fcloop_lports, lport_list) {
- if (tmplport->localport->node_name == opts->wwnn &&
- tmplport->localport->port_name == opts->wwpn)
- goto out_invalid_opts;
-
- if (tmplport->localport->node_name == opts->lpwwnn &&
- tmplport->localport->port_name == opts->lpwwpn)
- lport = tmplport;
+ lport = __fcloop_lport_lookup(opts->wwnn, opts->wwpn);
+ if (lport) {
+ /* invalid configuration */
+ fcloop_lport_put(lport);
+ goto out_free_newnport;
}
if (remoteport) {
- if (!lport)
- goto out_invalid_opts;
- newnport->lport = lport;
- }
-
- list_for_each_entry(nport, &fcloop_nports, nport_list) {
- if (nport->node_name == opts->wwnn &&
- nport->port_name == opts->wwpn) {
- if ((remoteport && nport->rport) ||
- (!remoteport && nport->tport)) {
- nport = NULL;
- goto out_invalid_opts;
- }
-
- fcloop_nport_get(nport);
-
- spin_unlock_irqrestore(&fcloop_lock, flags);
-
- if (remoteport)
- nport->lport = lport;
- if (opts->mask & NVMF_OPT_ROLES)
- nport->port_role = opts->roles;
- if (opts->mask & NVMF_OPT_FCADDR)
- nport->port_id = opts->fcaddr;
+ lport = __fcloop_lport_lookup(opts->lpwwnn, opts->lpwwpn);
+ if (!lport) {
+ /* invalid configuration */
goto out_free_newnport;
}
}
- list_add_tail(&newnport->nport_list, &fcloop_nports);
+ nport = __fcloop_nport_lookup(opts->wwnn, opts->wwpn);
+ if (nport) {
+ if ((remoteport && nport->rport) ||
+ (!remoteport && nport->tport)) {
+ /* invalid configuration */
+ goto out_put_nport;
+ }
+
+ /* found existing nport, discard the new nport */
+ kfree(newnport);
+ } else {
+ list_add_tail(&newnport->nport_list, &fcloop_nports);
+ nport = newnport;
+ }
+ if (opts->mask & NVMF_OPT_ROLES)
+ nport->port_role = opts->roles;
+ if (opts->mask & NVMF_OPT_FCADDR)
+ nport->port_id = opts->fcaddr;
+ if (lport) {
+ if (!nport->lport)
+ nport->lport = lport;
+ else
+ fcloop_lport_put(lport);
+ }
spin_unlock_irqrestore(&fcloop_lock, flags);
kfree(opts);
- return newnport;
+ return nport;
-out_invalid_opts:
- spin_unlock_irqrestore(&fcloop_lock, flags);
+out_put_nport:
+ if (lport)
+ fcloop_lport_put(lport);
+ fcloop_nport_put(nport);
out_free_newnport:
+ spin_unlock_irqrestore(&fcloop_lock, flags);
kfree(newnport);
out_free_opts:
kfree(opts);
- return nport;
+ return NULL;
}
static ssize_t
@@ -1352,6 +1461,7 @@ fcloop_create_remote_port(struct device *dev, struct device_attribute *attr,
rport->nport = nport;
rport->lport = nport->lport;
nport->rport = rport;
+ rport->flags = 0;
spin_lock_init(&rport->lock);
INIT_WORK(&rport->ls_work, fcloop_rport_lsrqst_work);
INIT_LIST_HEAD(&rport->ls_list);
@@ -1365,21 +1475,18 @@ __unlink_remote_port(struct fcloop_nport *nport)
{
struct fcloop_rport *rport = nport->rport;
+ lockdep_assert_held(&fcloop_lock);
+
if (rport && nport->tport)
nport->tport->remoteport = NULL;
nport->rport = NULL;
- list_del(&nport->nport_list);
-
return rport;
}
static int
__remoteport_unreg(struct fcloop_nport *nport, struct fcloop_rport *rport)
{
- if (!rport)
- return -EALREADY;
-
return nvme_fc_unregister_remoteport(rport->remoteport);
}
@@ -1387,8 +1494,8 @@ static ssize_t
fcloop_delete_remote_port(struct device *dev, struct device_attribute *attr,
const char *buf, size_t count)
{
- struct fcloop_nport *nport = NULL, *tmpport;
- static struct fcloop_rport *rport;
+ struct fcloop_nport *nport;
+ struct fcloop_rport *rport;
u64 nodename, portname;
unsigned long flags;
int ret;
@@ -1397,24 +1504,24 @@ fcloop_delete_remote_port(struct device *dev, struct device_attribute *attr,
if (ret)
return ret;
- spin_lock_irqsave(&fcloop_lock, flags);
-
- list_for_each_entry(tmpport, &fcloop_nports, nport_list) {
- if (tmpport->node_name == nodename &&
- tmpport->port_name == portname && tmpport->rport) {
- nport = tmpport;
- rport = __unlink_remote_port(nport);
- break;
- }
- }
+ nport = fcloop_nport_lookup(nodename, portname);
+ if (!nport)
+ return -ENOENT;
+ spin_lock_irqsave(&fcloop_lock, flags);
+ rport = __unlink_remote_port(nport);
spin_unlock_irqrestore(&fcloop_lock, flags);
- if (!nport)
- return -ENOENT;
+ if (!rport) {
+ ret = -ENOENT;
+ goto out_nport_put;
+ }
ret = __remoteport_unreg(nport, rport);
+out_nport_put:
+ fcloop_nport_put(nport);
+
return ret ? ret : count;
}
@@ -1452,6 +1559,7 @@ fcloop_create_target_port(struct device *dev, struct device_attribute *attr,
tport->nport = nport;
tport->lport = nport->lport;
nport->tport = tport;
+ tport->flags = 0;
spin_lock_init(&tport->lock);
INIT_WORK(&tport->ls_work, fcloop_tport_lsrqst_work);
INIT_LIST_HEAD(&tport->ls_list);
@@ -1465,6 +1573,8 @@ __unlink_target_port(struct fcloop_nport *nport)
{
struct fcloop_tport *tport = nport->tport;
+ lockdep_assert_held(&fcloop_lock);
+
if (tport && nport->rport)
nport->rport->targetport = NULL;
nport->tport = NULL;
@@ -1475,9 +1585,6 @@ __unlink_target_port(struct fcloop_nport *nport)
static int
__targetport_unreg(struct fcloop_nport *nport, struct fcloop_tport *tport)
{
- if (!tport)
- return -EALREADY;
-
return nvmet_fc_unregister_targetport(tport->targetport);
}
@@ -1485,8 +1592,8 @@ static ssize_t
fcloop_delete_target_port(struct device *dev, struct device_attribute *attr,
const char *buf, size_t count)
{
- struct fcloop_nport *nport = NULL, *tmpport;
- struct fcloop_tport *tport = NULL;
+ struct fcloop_nport *nport;
+ struct fcloop_tport *tport;
u64 nodename, portname;
unsigned long flags;
int ret;
@@ -1495,24 +1602,24 @@ fcloop_delete_target_port(struct device *dev, struct device_attribute *attr,
if (ret)
return ret;
- spin_lock_irqsave(&fcloop_lock, flags);
-
- list_for_each_entry(tmpport, &fcloop_nports, nport_list) {
- if (tmpport->node_name == nodename &&
- tmpport->port_name == portname && tmpport->tport) {
- nport = tmpport;
- tport = __unlink_target_port(nport);
- break;
- }
- }
+ nport = fcloop_nport_lookup(nodename, portname);
+ if (!nport)
+ return -ENOENT;
+ spin_lock_irqsave(&fcloop_lock, flags);
+ tport = __unlink_target_port(nport);
spin_unlock_irqrestore(&fcloop_lock, flags);
- if (!nport)
- return -ENOENT;
+ if (!tport) {
+ ret = -ENOENT;
+ goto out_nport_put;
+ }
ret = __targetport_unreg(nport, tport);
+out_nport_put:
+ fcloop_nport_put(nport);
+
return ret ? ret : count;
}
@@ -1578,15 +1685,20 @@ static const struct class fcloop_class = {
};
static struct device *fcloop_device;
-
static int __init fcloop_init(void)
{
int ret;
+ lsreq_cache = kmem_cache_create("lsreq_cache",
+ sizeof(struct fcloop_lsreq), 0,
+ 0, NULL);
+ if (!lsreq_cache)
+ return -ENOMEM;
+
ret = class_register(&fcloop_class);
if (ret) {
pr_err("couldn't register class fcloop\n");
- return ret;
+ goto out_destroy_cache;
}
fcloop_device = device_create_with_groups(
@@ -1604,13 +1716,15 @@ static int __init fcloop_init(void)
out_destroy_class:
class_unregister(&fcloop_class);
+out_destroy_cache:
+ kmem_cache_destroy(lsreq_cache);
return ret;
}
static void __exit fcloop_exit(void)
{
- struct fcloop_lport *lport = NULL;
- struct fcloop_nport *nport = NULL;
+ struct fcloop_lport *lport;
+ struct fcloop_nport *nport;
struct fcloop_tport *tport;
struct fcloop_rport *rport;
unsigned long flags;
@@ -1621,7 +1735,7 @@ static void __exit fcloop_exit(void)
for (;;) {
nport = list_first_entry_or_null(&fcloop_nports,
typeof(*nport), nport_list);
- if (!nport)
+ if (!nport || !fcloop_nport_get(nport))
break;
tport = __unlink_target_port(nport);
@@ -1629,13 +1743,21 @@ static void __exit fcloop_exit(void)
spin_unlock_irqrestore(&fcloop_lock, flags);
- ret = __targetport_unreg(nport, tport);
- if (ret)
- pr_warn("%s: Failed deleting target port\n", __func__);
+ if (tport) {
+ ret = __targetport_unreg(nport, tport);
+ if (ret)
+ pr_warn("%s: Failed deleting target port\n",
+ __func__);
+ }
- ret = __remoteport_unreg(nport, rport);
- if (ret)
- pr_warn("%s: Failed deleting remote port\n", __func__);
+ if (rport) {
+ ret = __remoteport_unreg(nport, rport);
+ if (ret)
+ pr_warn("%s: Failed deleting remote port\n",
+ __func__);
+ }
+
+ fcloop_nport_put(nport);
spin_lock_irqsave(&fcloop_lock, flags);
}
@@ -1648,7 +1770,7 @@ static void __exit fcloop_exit(void)
spin_unlock_irqrestore(&fcloop_lock, flags);
- ret = __wait_localport_unreg(lport);
+ ret = __localport_unreg(lport);
if (ret)
pr_warn("%s: Failed deleting local port\n", __func__);
@@ -1663,6 +1785,7 @@ static void __exit fcloop_exit(void)
device_destroy(&fcloop_class, MKDEV(0, 0));
class_unregister(&fcloop_class);
+ kmem_cache_destroy(lsreq_cache);
}
module_init(fcloop_init);
diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c
index a5c41144667c..f85a8441bcc6 100644
--- a/drivers/nvme/target/loop.c
+++ b/drivers/nvme/target/loop.c
@@ -33,10 +33,12 @@ struct nvme_loop_ctrl {
struct list_head list;
struct blk_mq_tag_set tag_set;
- struct nvme_loop_iod async_event_iod;
struct nvme_ctrl ctrl;
struct nvmet_port *port;
+
+ /* Must be last --ends in a flexible-array member. */
+ struct nvme_loop_iod async_event_iod;
};
static inline struct nvme_loop_ctrl *to_loop_ctrl(struct nvme_ctrl *ctrl)
@@ -148,8 +150,7 @@ static blk_status_t nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx,
nvme_start_request(req);
iod->cmd.common.flags |= NVME_CMD_SGL_METABUF;
iod->req.port = queue->ctrl->port;
- if (!nvmet_req_init(&iod->req, &queue->nvme_cq,
- &queue->nvme_sq, &nvme_loop_ops))
+ if (!nvmet_req_init(&iod->req, &queue->nvme_sq, &nvme_loop_ops))
return BLK_STS_OK;
if (blk_rq_nr_phys_segments(req)) {
@@ -181,8 +182,7 @@ static void nvme_loop_submit_async_event(struct nvme_ctrl *arg)
iod->cmd.common.command_id = NVME_AQ_BLK_MQ_DEPTH;
iod->cmd.common.flags |= NVME_CMD_SGL_METABUF;
- if (!nvmet_req_init(&iod->req, &queue->nvme_cq, &queue->nvme_sq,
- &nvme_loop_ops)) {
+ if (!nvmet_req_init(&iod->req, &queue->nvme_sq, &nvme_loop_ops)) {
dev_err(ctrl->ctrl.device, "failed async event work\n");
return;
}
@@ -273,6 +273,7 @@ static void nvme_loop_destroy_admin_queue(struct nvme_loop_ctrl *ctrl)
nvme_unquiesce_admin_queue(&ctrl->ctrl);
nvmet_sq_destroy(&ctrl->queues[0].nvme_sq);
+ nvmet_cq_put(&ctrl->queues[0].nvme_cq);
nvme_remove_admin_tag_set(&ctrl->ctrl);
}
@@ -302,6 +303,7 @@ static void nvme_loop_destroy_io_queues(struct nvme_loop_ctrl *ctrl)
for (i = 1; i < ctrl->ctrl.queue_count; i++) {
clear_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[i].flags);
nvmet_sq_destroy(&ctrl->queues[i].nvme_sq);
+ nvmet_cq_put(&ctrl->queues[i].nvme_cq);
}
ctrl->ctrl.queue_count = 1;
/*
@@ -327,9 +329,13 @@ static int nvme_loop_init_io_queues(struct nvme_loop_ctrl *ctrl)
for (i = 1; i <= nr_io_queues; i++) {
ctrl->queues[i].ctrl = ctrl;
- ret = nvmet_sq_init(&ctrl->queues[i].nvme_sq);
- if (ret)
+ nvmet_cq_init(&ctrl->queues[i].nvme_cq);
+ ret = nvmet_sq_init(&ctrl->queues[i].nvme_sq,
+ &ctrl->queues[i].nvme_cq);
+ if (ret) {
+ nvmet_cq_put(&ctrl->queues[i].nvme_cq);
goto out_destroy_queues;
+ }
ctrl->ctrl.queue_count++;
}
@@ -360,9 +366,13 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl)
int error;
ctrl->queues[0].ctrl = ctrl;
- error = nvmet_sq_init(&ctrl->queues[0].nvme_sq);
- if (error)
+ nvmet_cq_init(&ctrl->queues[0].nvme_cq);
+ error = nvmet_sq_init(&ctrl->queues[0].nvme_sq,
+ &ctrl->queues[0].nvme_cq);
+ if (error) {
+ nvmet_cq_put(&ctrl->queues[0].nvme_cq);
return error;
+ }
ctrl->ctrl.queue_count = 1;
error = nvme_alloc_admin_tag_set(&ctrl->ctrl, &ctrl->admin_tag_set,
@@ -401,6 +411,7 @@ out_cleanup_tagset:
nvme_remove_admin_tag_set(&ctrl->ctrl);
out_free_sq:
nvmet_sq_destroy(&ctrl->queues[0].nvme_sq);
+ nvmet_cq_put(&ctrl->queues[0].nvme_cq);
return error;
}
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index b6db8b74dc4a..df69a9dee71c 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -141,13 +141,16 @@ static inline struct device *nvmet_ns_dev(struct nvmet_ns *ns)
}
struct nvmet_cq {
+ struct nvmet_ctrl *ctrl;
u16 qid;
u16 size;
+ refcount_t ref;
};
struct nvmet_sq {
struct nvmet_ctrl *ctrl;
struct percpu_ref ref;
+ struct nvmet_cq *cq;
u16 qid;
u16 size;
u32 sqhd;
@@ -247,6 +250,7 @@ struct nvmet_pr_log_mgr {
struct nvmet_ctrl {
struct nvmet_subsys *subsys;
struct nvmet_sq **sqs;
+ struct nvmet_cq **cqs;
void *drvdata;
@@ -424,7 +428,7 @@ struct nvmet_fabrics_ops {
u16 (*get_max_queue_size)(const struct nvmet_ctrl *ctrl);
/* Operations mandatory for PCI target controllers */
- u16 (*create_sq)(struct nvmet_ctrl *ctrl, u16 sqid, u16 flags,
+ u16 (*create_sq)(struct nvmet_ctrl *ctrl, u16 sqid, u16 cqid, u16 flags,
u16 qsize, u64 prp1);
u16 (*delete_sq)(struct nvmet_ctrl *ctrl, u16 sqid);
u16 (*create_cq)(struct nvmet_ctrl *ctrl, u16 cqid, u16 flags,
@@ -557,8 +561,8 @@ u32 nvmet_fabrics_admin_cmd_data_len(struct nvmet_req *req);
u16 nvmet_parse_fabrics_io_cmd(struct nvmet_req *req);
u32 nvmet_fabrics_io_cmd_data_len(struct nvmet_req *req);
-bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq,
- struct nvmet_sq *sq, const struct nvmet_fabrics_ops *ops);
+bool nvmet_req_init(struct nvmet_req *req, struct nvmet_sq *sq,
+ const struct nvmet_fabrics_ops *ops);
void nvmet_req_uninit(struct nvmet_req *req);
size_t nvmet_req_transfer_len(struct nvmet_req *req);
bool nvmet_check_transfer_len(struct nvmet_req *req, size_t len);
@@ -571,18 +575,24 @@ void nvmet_execute_set_features(struct nvmet_req *req);
void nvmet_execute_get_features(struct nvmet_req *req);
void nvmet_execute_keep_alive(struct nvmet_req *req);
-u16 nvmet_check_cqid(struct nvmet_ctrl *ctrl, u16 cqid);
+u16 nvmet_check_cqid(struct nvmet_ctrl *ctrl, u16 cqid, bool create);
+u16 nvmet_check_io_cqid(struct nvmet_ctrl *ctrl, u16 cqid, bool create);
+void nvmet_cq_init(struct nvmet_cq *cq);
void nvmet_cq_setup(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq, u16 qid,
u16 size);
u16 nvmet_cq_create(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq, u16 qid,
u16 size);
+void nvmet_cq_destroy(struct nvmet_cq *cq);
+bool nvmet_cq_get(struct nvmet_cq *cq);
+void nvmet_cq_put(struct nvmet_cq *cq);
+bool nvmet_cq_in_use(struct nvmet_cq *cq);
u16 nvmet_check_sqid(struct nvmet_ctrl *ctrl, u16 sqid, bool create);
void nvmet_sq_setup(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq, u16 qid,
u16 size);
-u16 nvmet_sq_create(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq, u16 qid,
- u16 size);
+u16 nvmet_sq_create(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq,
+ struct nvmet_cq *cq, u16 qid, u16 size);
void nvmet_sq_destroy(struct nvmet_sq *sq);
-int nvmet_sq_init(struct nvmet_sq *sq);
+int nvmet_sq_init(struct nvmet_sq *sq, struct nvmet_cq *cq);
void nvmet_ctrl_fatal_error(struct nvmet_ctrl *ctrl);
diff --git a/drivers/nvme/target/pci-epf.c b/drivers/nvme/target/pci-epf.c
index 7fab7f3d79b7..a4295a5b8d28 100644
--- a/drivers/nvme/target/pci-epf.c
+++ b/drivers/nvme/target/pci-epf.c
@@ -62,8 +62,7 @@ static DEFINE_MUTEX(nvmet_pci_epf_ports_mutex);
#define NVMET_PCI_EPF_CQ_RETRY_INTERVAL msecs_to_jiffies(1)
enum nvmet_pci_epf_queue_flags {
- NVMET_PCI_EPF_Q_IS_SQ = 0, /* The queue is a submission queue */
- NVMET_PCI_EPF_Q_LIVE, /* The queue is live */
+ NVMET_PCI_EPF_Q_LIVE = 0, /* The queue is live */
NVMET_PCI_EPF_Q_IRQ_ENABLED, /* IRQ is enabled for this queue */
};
@@ -596,9 +595,6 @@ static bool nvmet_pci_epf_should_raise_irq(struct nvmet_pci_epf_ctrl *ctrl,
struct nvmet_pci_epf_irq_vector *iv = cq->iv;
bool ret;
- if (!test_bit(NVMET_PCI_EPF_Q_IRQ_ENABLED, &cq->flags))
- return false;
-
/* IRQ coalescing for the admin queue is not allowed. */
if (!cq->qid)
return true;
@@ -625,7 +621,8 @@ static void nvmet_pci_epf_raise_irq(struct nvmet_pci_epf_ctrl *ctrl,
struct pci_epf *epf = nvme_epf->epf;
int ret = 0;
- if (!test_bit(NVMET_PCI_EPF_Q_LIVE, &cq->flags))
+ if (!test_bit(NVMET_PCI_EPF_Q_LIVE, &cq->flags) ||
+ !test_bit(NVMET_PCI_EPF_Q_IRQ_ENABLED, &cq->flags))
return;
mutex_lock(&ctrl->irq_lock);
@@ -636,14 +633,16 @@ static void nvmet_pci_epf_raise_irq(struct nvmet_pci_epf_ctrl *ctrl,
switch (nvme_epf->irq_type) {
case PCI_IRQ_MSIX:
case PCI_IRQ_MSI:
+ /*
+ * If we fail to raise an MSI or MSI-X interrupt, it is likely
+ * because the host is using legacy INTX IRQs (e.g. BIOS,
+ * grub), but we can fallback to the INTX type only if the
+ * endpoint controller supports this type.
+ */
ret = pci_epc_raise_irq(epf->epc, epf->func_no, epf->vfunc_no,
nvme_epf->irq_type, cq->vector + 1);
- if (!ret)
+ if (!ret || !nvme_epf->epc_features->intx_capable)
break;
- /*
- * If we got an error, it is likely because the host is using
- * legacy IRQs (e.g. BIOS, grub).
- */
fallthrough;
case PCI_IRQ_INTX:
ret = pci_epc_raise_irq(epf->epc, epf->func_no, epf->vfunc_no,
@@ -656,7 +655,9 @@ static void nvmet_pci_epf_raise_irq(struct nvmet_pci_epf_ctrl *ctrl,
}
if (ret)
- dev_err(ctrl->dev, "Failed to raise IRQ (err=%d)\n", ret);
+ dev_err_ratelimited(ctrl->dev,
+ "CQ[%u]: Failed to raise IRQ (err=%d)\n",
+ cq->qid, ret);
unlock:
mutex_unlock(&ctrl->irq_lock);
@@ -1319,8 +1320,14 @@ static u16 nvmet_pci_epf_create_cq(struct nvmet_ctrl *tctrl,
set_bit(NVMET_PCI_EPF_Q_LIVE, &cq->flags);
- dev_dbg(ctrl->dev, "CQ[%u]: %u entries of %zu B, IRQ vector %u\n",
- cqid, qsize, cq->qes, cq->vector);
+ if (test_bit(NVMET_PCI_EPF_Q_IRQ_ENABLED, &cq->flags))
+ dev_dbg(ctrl->dev,
+ "CQ[%u]: %u entries of %zu B, IRQ vector %u\n",
+ cqid, qsize, cq->qes, cq->vector);
+ else
+ dev_dbg(ctrl->dev,
+ "CQ[%u]: %u entries of %zu B, IRQ disabled\n",
+ cqid, qsize, cq->qes);
return NVME_SC_SUCCESS;
@@ -1344,17 +1351,20 @@ static u16 nvmet_pci_epf_delete_cq(struct nvmet_ctrl *tctrl, u16 cqid)
cancel_delayed_work_sync(&cq->work);
nvmet_pci_epf_drain_queue(cq);
- nvmet_pci_epf_remove_irq_vector(ctrl, cq->vector);
+ if (test_and_clear_bit(NVMET_PCI_EPF_Q_IRQ_ENABLED, &cq->flags))
+ nvmet_pci_epf_remove_irq_vector(ctrl, cq->vector);
nvmet_pci_epf_mem_unmap(ctrl->nvme_epf, &cq->pci_map);
+ nvmet_cq_put(&cq->nvme_cq);
return NVME_SC_SUCCESS;
}
static u16 nvmet_pci_epf_create_sq(struct nvmet_ctrl *tctrl,
- u16 sqid, u16 flags, u16 qsize, u64 pci_addr)
+ u16 sqid, u16 cqid, u16 flags, u16 qsize, u64 pci_addr)
{
struct nvmet_pci_epf_ctrl *ctrl = tctrl->drvdata;
struct nvmet_pci_epf_queue *sq = &ctrl->sq[sqid];
+ struct nvmet_pci_epf_queue *cq = &ctrl->cq[cqid];
u16 status;
if (test_bit(NVMET_PCI_EPF_Q_LIVE, &sq->flags))
@@ -1377,7 +1387,8 @@ static u16 nvmet_pci_epf_create_sq(struct nvmet_ctrl *tctrl,
sq->qes = ctrl->io_sqes;
sq->pci_size = sq->qes * sq->depth;
- status = nvmet_sq_create(tctrl, &sq->nvme_sq, sqid, sq->depth);
+ status = nvmet_sq_create(tctrl, &sq->nvme_sq, &cq->nvme_cq, sqid,
+ sq->depth);
if (status != NVME_SC_SUCCESS)
return status;
@@ -1533,7 +1544,6 @@ static void nvmet_pci_epf_init_queue(struct nvmet_pci_epf_ctrl *ctrl,
if (sq) {
queue = &ctrl->sq[qid];
- set_bit(NVMET_PCI_EPF_Q_IS_SQ, &queue->flags);
} else {
queue = &ctrl->cq[qid];
INIT_DELAYED_WORK(&queue->work, nvmet_pci_epf_cq_work);
@@ -1594,8 +1604,7 @@ static void nvmet_pci_epf_exec_iod_work(struct work_struct *work)
goto complete;
}
- if (!nvmet_req_init(req, &iod->cq->nvme_cq, &iod->sq->nvme_sq,
- &nvmet_pci_epf_fabrics_ops))
+ if (!nvmet_req_init(req, &iod->sq->nvme_sq, &nvmet_pci_epf_fabrics_ops))
goto complete;
iod->data_len = nvmet_req_transfer_len(req);
@@ -1872,8 +1881,8 @@ static int nvmet_pci_epf_enable_ctrl(struct nvmet_pci_epf_ctrl *ctrl)
qsize = aqa & 0x00000fff;
pci_addr = asq & GENMASK_ULL(63, 12);
- status = nvmet_pci_epf_create_sq(ctrl->tctrl, 0, NVME_QUEUE_PHYS_CONTIG,
- qsize, pci_addr);
+ status = nvmet_pci_epf_create_sq(ctrl->tctrl, 0, 0,
+ NVME_QUEUE_PHYS_CONTIG, qsize, pci_addr);
if (status != NVME_SC_SUCCESS) {
dev_err(ctrl->dev, "Failed to create admin submission queue\n");
nvmet_pci_epf_delete_cq(ctrl->tctrl, 0);
diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c
index 2a4536ef6184..432bdf7cd49e 100644
--- a/drivers/nvme/target/rdma.c
+++ b/drivers/nvme/target/rdma.c
@@ -976,8 +976,7 @@ static void nvmet_rdma_handle_command(struct nvmet_rdma_queue *queue,
cmd->send_sge.addr, cmd->send_sge.length,
DMA_TO_DEVICE);
- if (!nvmet_req_init(&cmd->req, &queue->nvme_cq,
- &queue->nvme_sq, &nvmet_rdma_ops))
+ if (!nvmet_req_init(&cmd->req, &queue->nvme_sq, &nvmet_rdma_ops))
return;
status = nvmet_rdma_map_sgl(cmd);
@@ -1353,6 +1352,7 @@ static void nvmet_rdma_free_queue(struct nvmet_rdma_queue *queue)
pr_debug("freeing queue %d\n", queue->idx);
nvmet_sq_destroy(&queue->nvme_sq);
+ nvmet_cq_put(&queue->nvme_cq);
nvmet_rdma_destroy_queue_ib(queue);
if (!queue->nsrq) {
@@ -1436,7 +1436,8 @@ nvmet_rdma_alloc_queue(struct nvmet_rdma_device *ndev,
goto out_reject;
}
- ret = nvmet_sq_init(&queue->nvme_sq);
+ nvmet_cq_init(&queue->nvme_cq);
+ ret = nvmet_sq_init(&queue->nvme_sq, &queue->nvme_cq);
if (ret) {
ret = NVME_RDMA_CM_NO_RSC;
goto out_free_queue;
@@ -1517,6 +1518,7 @@ out_ida_remove:
out_destroy_sq:
nvmet_sq_destroy(&queue->nvme_sq);
out_free_queue:
+ nvmet_cq_put(&queue->nvme_cq);
kfree(queue);
out_reject:
nvmet_rdma_cm_reject(cm_id, ret);
diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
index 12a5cb8641ca..c6603bd9c95e 100644
--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c
@@ -7,6 +7,7 @@
#include <linux/module.h>
#include <linux/init.h>
#include <linux/slab.h>
+#include <linux/crc32c.h>
#include <linux/err.h>
#include <linux/nvme-tcp.h>
#include <linux/nvme-keyring.h>
@@ -17,7 +18,6 @@
#include <net/handshake.h>
#include <linux/inet.h>
#include <linux/llist.h>
-#include <crypto/hash.h>
#include <trace/events/sock.h>
#include "nvmet.h"
@@ -172,8 +172,6 @@ struct nvmet_tcp_queue {
/* digest state */
bool hdr_digest;
bool data_digest;
- struct ahash_request *snd_hash;
- struct ahash_request *rcv_hash;
/* TLS state */
key_serial_t tls_pskid;
@@ -294,14 +292,9 @@ static inline u8 nvmet_tcp_ddgst_len(struct nvmet_tcp_queue *queue)
return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0;
}
-static inline void nvmet_tcp_hdgst(struct ahash_request *hash,
- void *pdu, size_t len)
+static inline void nvmet_tcp_hdgst(void *pdu, size_t len)
{
- struct scatterlist sg;
-
- sg_init_one(&sg, pdu, len);
- ahash_request_set_crypt(hash, &sg, pdu + len, len);
- crypto_ahash_digest(hash);
+ put_unaligned_le32(~crc32c(~0, pdu, len), pdu + len);
}
static int nvmet_tcp_verify_hdgst(struct nvmet_tcp_queue *queue,
@@ -318,7 +311,7 @@ static int nvmet_tcp_verify_hdgst(struct nvmet_tcp_queue *queue,
}
recv_digest = *(__le32 *)(pdu + hdr->hlen);
- nvmet_tcp_hdgst(queue->rcv_hash, pdu, len);
+ nvmet_tcp_hdgst(pdu, len);
exp_digest = *(__le32 *)(pdu + hdr->hlen);
if (recv_digest != exp_digest) {
pr_err("queue %d: header digest error: recv %#x expected %#x\n",
@@ -441,12 +434,24 @@ err:
return NVME_SC_INTERNAL;
}
-static void nvmet_tcp_calc_ddgst(struct ahash_request *hash,
- struct nvmet_tcp_cmd *cmd)
+static void nvmet_tcp_calc_ddgst(struct nvmet_tcp_cmd *cmd)
{
- ahash_request_set_crypt(hash, cmd->req.sg,
- (void *)&cmd->exp_ddgst, cmd->req.transfer_len);
- crypto_ahash_digest(hash);
+ size_t total_len = cmd->req.transfer_len;
+ struct scatterlist *sg = cmd->req.sg;
+ u32 crc = ~0;
+
+ while (total_len) {
+ size_t len = min_t(size_t, total_len, sg->length);
+
+ /*
+ * Note that the scatterlist does not contain any highmem pages,
+ * as it was allocated by sgl_alloc() with GFP_KERNEL.
+ */
+ crc = crc32c(crc, sg_virt(sg), len);
+ total_len -= len;
+ sg = sg_next(sg);
+ }
+ cmd->exp_ddgst = cpu_to_le32(~crc);
}
static void nvmet_setup_c2h_data_pdu(struct nvmet_tcp_cmd *cmd)
@@ -473,19 +478,18 @@ static void nvmet_setup_c2h_data_pdu(struct nvmet_tcp_cmd *cmd)
if (queue->data_digest) {
pdu->hdr.flags |= NVME_TCP_F_DDGST;
- nvmet_tcp_calc_ddgst(queue->snd_hash, cmd);
+ nvmet_tcp_calc_ddgst(cmd);
}
if (cmd->queue->hdr_digest) {
pdu->hdr.flags |= NVME_TCP_F_HDGST;
- nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
+ nvmet_tcp_hdgst(pdu, sizeof(*pdu));
}
}
static void nvmet_setup_r2t_pdu(struct nvmet_tcp_cmd *cmd)
{
struct nvme_tcp_r2t_pdu *pdu = cmd->r2t_pdu;
- struct nvmet_tcp_queue *queue = cmd->queue;
u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
cmd->offset = 0;
@@ -503,14 +507,13 @@ static void nvmet_setup_r2t_pdu(struct nvmet_tcp_cmd *cmd)
pdu->r2t_offset = cpu_to_le32(cmd->rbytes_done);
if (cmd->queue->hdr_digest) {
pdu->hdr.flags |= NVME_TCP_F_HDGST;
- nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
+ nvmet_tcp_hdgst(pdu, sizeof(*pdu));
}
}
static void nvmet_setup_response_pdu(struct nvmet_tcp_cmd *cmd)
{
struct nvme_tcp_rsp_pdu *pdu = cmd->rsp_pdu;
- struct nvmet_tcp_queue *queue = cmd->queue;
u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
cmd->offset = 0;
@@ -523,7 +526,7 @@ static void nvmet_setup_response_pdu(struct nvmet_tcp_cmd *cmd)
pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst);
if (cmd->queue->hdr_digest) {
pdu->hdr.flags |= NVME_TCP_F_HDGST;
- nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
+ nvmet_tcp_hdgst(pdu, sizeof(*pdu));
}
}
@@ -857,42 +860,6 @@ static void nvmet_prepare_receive_pdu(struct nvmet_tcp_queue *queue)
smp_store_release(&queue->rcv_state, NVMET_TCP_RECV_PDU);
}
-static void nvmet_tcp_free_crypto(struct nvmet_tcp_queue *queue)
-{
- struct crypto_ahash *tfm = crypto_ahash_reqtfm(queue->rcv_hash);
-
- ahash_request_free(queue->rcv_hash);
- ahash_request_free(queue->snd_hash);
- crypto_free_ahash(tfm);
-}
-
-static int nvmet_tcp_alloc_crypto(struct nvmet_tcp_queue *queue)
-{
- struct crypto_ahash *tfm;
-
- tfm = crypto_alloc_ahash("crc32c", 0, CRYPTO_ALG_ASYNC);
- if (IS_ERR(tfm))
- return PTR_ERR(tfm);
-
- queue->snd_hash = ahash_request_alloc(tfm, GFP_KERNEL);
- if (!queue->snd_hash)
- goto free_tfm;
- ahash_request_set_callback(queue->snd_hash, 0, NULL, NULL);
-
- queue->rcv_hash = ahash_request_alloc(tfm, GFP_KERNEL);
- if (!queue->rcv_hash)
- goto free_snd_hash;
- ahash_request_set_callback(queue->rcv_hash, 0, NULL, NULL);
-
- return 0;
-free_snd_hash:
- ahash_request_free(queue->snd_hash);
-free_tfm:
- crypto_free_ahash(tfm);
- return -ENOMEM;
-}
-
-
static int nvmet_tcp_handle_icreq(struct nvmet_tcp_queue *queue)
{
struct nvme_tcp_icreq_pdu *icreq = &queue->pdu.icreq;
@@ -921,11 +888,6 @@ static int nvmet_tcp_handle_icreq(struct nvmet_tcp_queue *queue)
queue->hdr_digest = !!(icreq->digest & NVME_TCP_HDR_DIGEST_ENABLE);
queue->data_digest = !!(icreq->digest & NVME_TCP_DATA_DIGEST_ENABLE);
- if (queue->hdr_digest || queue->data_digest) {
- ret = nvmet_tcp_alloc_crypto(queue);
- if (ret)
- return ret;
- }
memset(icresp, 0, sizeof(*icresp));
icresp->hdr.type = nvme_tcp_icresp;
@@ -1077,8 +1039,7 @@ static int nvmet_tcp_done_recv_pdu(struct nvmet_tcp_queue *queue)
req = &queue->cmd->req;
memcpy(req->cmd, nvme_cmd, sizeof(*nvme_cmd));
- if (unlikely(!nvmet_req_init(req, &queue->nvme_cq,
- &queue->nvme_sq, &nvmet_tcp_ops))) {
+ if (unlikely(!nvmet_req_init(req, &queue->nvme_sq, &nvmet_tcp_ops))) {
pr_err("failed cmd %p id %d opcode %d, data_len: %d, status: %04x\n",
req->cmd, req->cmd->common.command_id,
req->cmd->common.opcode,
@@ -1247,7 +1208,7 @@ static void nvmet_tcp_prep_recv_ddgst(struct nvmet_tcp_cmd *cmd)
{
struct nvmet_tcp_queue *queue = cmd->queue;
- nvmet_tcp_calc_ddgst(queue->rcv_hash, cmd);
+ nvmet_tcp_calc_ddgst(cmd);
queue->offset = 0;
queue->left = NVME_TCP_DIGEST_LENGTH;
queue->rcv_state = NVMET_TCP_RECV_DDGST;
@@ -1615,13 +1576,12 @@ static void nvmet_tcp_release_queue_work(struct work_struct *w)
nvmet_sq_put_tls_key(&queue->nvme_sq);
nvmet_tcp_uninit_data_in_cmds(queue);
nvmet_sq_destroy(&queue->nvme_sq);
+ nvmet_cq_put(&queue->nvme_cq);
cancel_work_sync(&queue->io_work);
nvmet_tcp_free_cmd_data_in_buffers(queue);
/* ->sock will be released by fput() */
fput(queue->sock->file);
nvmet_tcp_free_cmds(queue);
- if (queue->hdr_digest || queue->data_digest)
- nvmet_tcp_free_crypto(queue);
ida_free(&nvmet_tcp_queue_ida, queue->idx);
page_frag_cache_drain(&queue->pf_cache);
kfree(queue);
@@ -1950,7 +1910,8 @@ static void nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port,
if (ret)
goto out_ida_remove;
- ret = nvmet_sq_init(&queue->nvme_sq);
+ nvmet_cq_init(&queue->nvme_cq);
+ ret = nvmet_sq_init(&queue->nvme_sq, &queue->nvme_cq);
if (ret)
goto out_free_connect;
@@ -1993,6 +1954,7 @@ out_destroy_sq:
mutex_unlock(&nvmet_tcp_queue_mutex);
nvmet_sq_destroy(&queue->nvme_sq);
out_free_connect:
+ nvmet_cq_put(&queue->nvme_cq);
nvmet_tcp_free_cmd(&queue->connect);
out_ida_remove:
ida_free(&nvmet_tcp_queue_ida, queue->idx);
diff --git a/drivers/phy/phy-can-transceiver.c b/drivers/phy/phy-can-transceiver.c
index 2bec70615449..f59caff4b3d4 100644
--- a/drivers/phy/phy-can-transceiver.c
+++ b/drivers/phy/phy-can-transceiver.c
@@ -93,6 +93,16 @@ static const struct of_device_id can_transceiver_phy_ids[] = {
};
MODULE_DEVICE_TABLE(of, can_transceiver_phy_ids);
+/* Temporary wrapper until the multiplexer subsystem supports optional muxes */
+static inline struct mux_state *
+devm_mux_state_get_optional(struct device *dev, const char *mux_name)
+{
+ if (!of_property_present(dev->of_node, "mux-states"))
+ return NULL;
+
+ return devm_mux_state_get(dev, mux_name);
+}
+
static int can_transceiver_phy_probe(struct platform_device *pdev)
{
struct phy_provider *phy_provider;
@@ -114,13 +124,11 @@ static int can_transceiver_phy_probe(struct platform_device *pdev)
match = of_match_node(can_transceiver_phy_ids, pdev->dev.of_node);
drvdata = match->data;
- mux_state = devm_mux_state_get(dev, NULL);
- if (IS_ERR(mux_state)) {
- if (PTR_ERR(mux_state) == -EPROBE_DEFER)
- return PTR_ERR(mux_state);
- } else {
- can_transceiver_phy->mux_state = mux_state;
- }
+ mux_state = devm_mux_state_get_optional(dev, NULL);
+ if (IS_ERR(mux_state))
+ return PTR_ERR(mux_state);
+
+ can_transceiver_phy->mux_state = mux_state;
phy = devm_phy_create(dev, dev->of_node,
&can_transceiver_phy_ops);
diff --git a/drivers/phy/qualcomm/phy-qcom-qmp-ufs.c b/drivers/phy/qualcomm/phy-qcom-qmp-ufs.c
index 45b3b792696e..b33e2e2b5014 100644
--- a/drivers/phy/qualcomm/phy-qcom-qmp-ufs.c
+++ b/drivers/phy/qualcomm/phy-qcom-qmp-ufs.c
@@ -1754,7 +1754,8 @@ static void qmp_ufs_init_registers(struct qmp_ufs *qmp, const struct qmp_phy_cfg
qmp_ufs_init_all(qmp, &cfg->tbls_hs_overlay[i]);
}
- qmp_ufs_init_all(qmp, &cfg->tbls_hs_b);
+ if (qmp->mode == PHY_MODE_UFS_HS_B)
+ qmp_ufs_init_all(qmp, &cfg->tbls_hs_b);
}
static int qmp_ufs_com_init(struct qmp_ufs *qmp)
diff --git a/drivers/phy/renesas/phy-rcar-gen3-usb2.c b/drivers/phy/renesas/phy-rcar-gen3-usb2.c
index 775f4f973a6c..9fdf17e0848a 100644
--- a/drivers/phy/renesas/phy-rcar-gen3-usb2.c
+++ b/drivers/phy/renesas/phy-rcar-gen3-usb2.c
@@ -9,6 +9,7 @@
* Copyright (C) 2014 Cogent Embedded, Inc.
*/
+#include <linux/cleanup.h>
#include <linux/extcon-provider.h>
#include <linux/interrupt.h>
#include <linux/io.h>
@@ -107,7 +108,6 @@ struct rcar_gen3_phy {
struct rcar_gen3_chan *ch;
u32 int_enable_bits;
bool initialized;
- bool otg_initialized;
bool powered;
};
@@ -119,9 +119,8 @@ struct rcar_gen3_chan {
struct regulator *vbus;
struct reset_control *rstc;
struct work_struct work;
- struct mutex lock; /* protects rphys[...].powered */
+ spinlock_t lock; /* protects access to hardware and driver data structure. */
enum usb_dr_mode dr_mode;
- int irq;
u32 obint_enable_bits;
bool extcon_host;
bool is_otg_channel;
@@ -320,16 +319,15 @@ static bool rcar_gen3_is_any_rphy_initialized(struct rcar_gen3_chan *ch)
return false;
}
-static bool rcar_gen3_needs_init_otg(struct rcar_gen3_chan *ch)
+static bool rcar_gen3_is_any_otg_rphy_initialized(struct rcar_gen3_chan *ch)
{
- int i;
-
- for (i = 0; i < NUM_OF_PHYS; i++) {
- if (ch->rphys[i].otg_initialized)
- return false;
+ for (enum rcar_gen3_phy_index i = PHY_INDEX_BOTH_HC; i <= PHY_INDEX_EHCI;
+ i++) {
+ if (ch->rphys[i].initialized)
+ return true;
}
- return true;
+ return false;
}
static bool rcar_gen3_are_all_rphys_power_off(struct rcar_gen3_chan *ch)
@@ -351,7 +349,9 @@ static ssize_t role_store(struct device *dev, struct device_attribute *attr,
bool is_b_device;
enum phy_mode cur_mode, new_mode;
- if (!ch->is_otg_channel || !rcar_gen3_is_any_rphy_initialized(ch))
+ guard(spinlock_irqsave)(&ch->lock);
+
+ if (!ch->is_otg_channel || !rcar_gen3_is_any_otg_rphy_initialized(ch))
return -EIO;
if (sysfs_streq(buf, "host"))
@@ -389,7 +389,7 @@ static ssize_t role_show(struct device *dev, struct device_attribute *attr,
{
struct rcar_gen3_chan *ch = dev_get_drvdata(dev);
- if (!ch->is_otg_channel || !rcar_gen3_is_any_rphy_initialized(ch))
+ if (!ch->is_otg_channel || !rcar_gen3_is_any_otg_rphy_initialized(ch))
return -EIO;
return sprintf(buf, "%s\n", rcar_gen3_is_host(ch) ? "host" :
@@ -402,6 +402,9 @@ static void rcar_gen3_init_otg(struct rcar_gen3_chan *ch)
void __iomem *usb2_base = ch->base;
u32 val;
+ if (!ch->is_otg_channel || rcar_gen3_is_any_otg_rphy_initialized(ch))
+ return;
+
/* Should not use functions of read-modify-write a register */
val = readl(usb2_base + USB2_LINECTRL1);
val = (val & ~USB2_LINECTRL1_DP_RPD) | USB2_LINECTRL1_DPRPD_EN |
@@ -415,7 +418,7 @@ static void rcar_gen3_init_otg(struct rcar_gen3_chan *ch)
val = readl(usb2_base + USB2_ADPCTRL);
writel(val | USB2_ADPCTRL_IDPULLUP, usb2_base + USB2_ADPCTRL);
}
- msleep(20);
+ mdelay(20);
writel(0xffffffff, usb2_base + USB2_OBINTSTA);
writel(ch->obint_enable_bits, usb2_base + USB2_OBINTEN);
@@ -427,16 +430,27 @@ static irqreturn_t rcar_gen3_phy_usb2_irq(int irq, void *_ch)
{
struct rcar_gen3_chan *ch = _ch;
void __iomem *usb2_base = ch->base;
- u32 status = readl(usb2_base + USB2_OBINTSTA);
+ struct device *dev = ch->dev;
irqreturn_t ret = IRQ_NONE;
+ u32 status;
+
+ pm_runtime_get_noresume(dev);
+
+ if (pm_runtime_suspended(dev))
+ goto rpm_put;
- if (status & ch->obint_enable_bits) {
- dev_vdbg(ch->dev, "%s: %08x\n", __func__, status);
- writel(ch->obint_enable_bits, usb2_base + USB2_OBINTSTA);
- rcar_gen3_device_recognition(ch);
- ret = IRQ_HANDLED;
+ scoped_guard(spinlock, &ch->lock) {
+ status = readl(usb2_base + USB2_OBINTSTA);
+ if (status & ch->obint_enable_bits) {
+ dev_vdbg(dev, "%s: %08x\n", __func__, status);
+ writel(ch->obint_enable_bits, usb2_base + USB2_OBINTSTA);
+ rcar_gen3_device_recognition(ch);
+ ret = IRQ_HANDLED;
+ }
}
+rpm_put:
+ pm_runtime_put_noidle(dev);
return ret;
}
@@ -446,32 +460,23 @@ static int rcar_gen3_phy_usb2_init(struct phy *p)
struct rcar_gen3_chan *channel = rphy->ch;
void __iomem *usb2_base = channel->base;
u32 val;
- int ret;
- if (!rcar_gen3_is_any_rphy_initialized(channel) && channel->irq >= 0) {
- INIT_WORK(&channel->work, rcar_gen3_phy_usb2_work);
- ret = request_irq(channel->irq, rcar_gen3_phy_usb2_irq,
- IRQF_SHARED, dev_name(channel->dev), channel);
- if (ret < 0) {
- dev_err(channel->dev, "No irq handler (%d)\n", channel->irq);
- return ret;
- }
- }
+ guard(spinlock_irqsave)(&channel->lock);
/* Initialize USB2 part */
val = readl(usb2_base + USB2_INT_ENABLE);
val |= USB2_INT_ENABLE_UCOM_INTEN | rphy->int_enable_bits;
writel(val, usb2_base + USB2_INT_ENABLE);
- writel(USB2_SPD_RSM_TIMSET_INIT, usb2_base + USB2_SPD_RSM_TIMSET);
- writel(USB2_OC_TIMSET_INIT, usb2_base + USB2_OC_TIMSET);
-
- /* Initialize otg part */
- if (channel->is_otg_channel) {
- if (rcar_gen3_needs_init_otg(channel))
- rcar_gen3_init_otg(channel);
- rphy->otg_initialized = true;
+
+ if (!rcar_gen3_is_any_rphy_initialized(channel)) {
+ writel(USB2_SPD_RSM_TIMSET_INIT, usb2_base + USB2_SPD_RSM_TIMSET);
+ writel(USB2_OC_TIMSET_INIT, usb2_base + USB2_OC_TIMSET);
}
+ /* Initialize otg part (only if we initialize a PHY with IRQs). */
+ if (rphy->int_enable_bits)
+ rcar_gen3_init_otg(channel);
+
rphy->initialized = true;
return 0;
@@ -484,10 +489,9 @@ static int rcar_gen3_phy_usb2_exit(struct phy *p)
void __iomem *usb2_base = channel->base;
u32 val;
- rphy->initialized = false;
+ guard(spinlock_irqsave)(&channel->lock);
- if (channel->is_otg_channel)
- rphy->otg_initialized = false;
+ rphy->initialized = false;
val = readl(usb2_base + USB2_INT_ENABLE);
val &= ~rphy->int_enable_bits;
@@ -495,9 +499,6 @@ static int rcar_gen3_phy_usb2_exit(struct phy *p)
val &= ~USB2_INT_ENABLE_UCOM_INTEN;
writel(val, usb2_base + USB2_INT_ENABLE);
- if (channel->irq >= 0 && !rcar_gen3_is_any_rphy_initialized(channel))
- free_irq(channel->irq, channel);
-
return 0;
}
@@ -509,16 +510,17 @@ static int rcar_gen3_phy_usb2_power_on(struct phy *p)
u32 val;
int ret = 0;
- mutex_lock(&channel->lock);
- if (!rcar_gen3_are_all_rphys_power_off(channel))
- goto out;
-
if (channel->vbus) {
ret = regulator_enable(channel->vbus);
if (ret)
- goto out;
+ return ret;
}
+ guard(spinlock_irqsave)(&channel->lock);
+
+ if (!rcar_gen3_are_all_rphys_power_off(channel))
+ goto out;
+
val = readl(usb2_base + USB2_USBCTR);
val |= USB2_USBCTR_PLL_RST;
writel(val, usb2_base + USB2_USBCTR);
@@ -528,7 +530,6 @@ static int rcar_gen3_phy_usb2_power_on(struct phy *p)
out:
/* The powered flag should be set for any other phys anyway */
rphy->powered = true;
- mutex_unlock(&channel->lock);
return 0;
}
@@ -539,18 +540,20 @@ static int rcar_gen3_phy_usb2_power_off(struct phy *p)
struct rcar_gen3_chan *channel = rphy->ch;
int ret = 0;
- mutex_lock(&channel->lock);
- rphy->powered = false;
+ scoped_guard(spinlock_irqsave, &channel->lock) {
+ rphy->powered = false;
- if (!rcar_gen3_are_all_rphys_power_off(channel))
- goto out;
+ if (rcar_gen3_are_all_rphys_power_off(channel)) {
+ u32 val = readl(channel->base + USB2_USBCTR);
+
+ val |= USB2_USBCTR_PLL_RST;
+ writel(val, channel->base + USB2_USBCTR);
+ }
+ }
if (channel->vbus)
ret = regulator_disable(channel->vbus);
-out:
- mutex_unlock(&channel->lock);
-
return ret;
}
@@ -703,7 +706,7 @@ static int rcar_gen3_phy_usb2_probe(struct platform_device *pdev)
struct device *dev = &pdev->dev;
struct rcar_gen3_chan *channel;
struct phy_provider *provider;
- int ret = 0, i;
+ int ret = 0, i, irq;
if (!dev->of_node) {
dev_err(dev, "This driver needs device tree\n");
@@ -719,8 +722,6 @@ static int rcar_gen3_phy_usb2_probe(struct platform_device *pdev)
return PTR_ERR(channel->base);
channel->obint_enable_bits = USB2_OBINT_BITS;
- /* get irq number here and request_irq for OTG in phy_init */
- channel->irq = platform_get_irq_optional(pdev, 0);
channel->dr_mode = rcar_gen3_get_dr_mode(dev->of_node);
if (channel->dr_mode != USB_DR_MODE_UNKNOWN) {
channel->is_otg_channel = true;
@@ -763,7 +764,7 @@ static int rcar_gen3_phy_usb2_probe(struct platform_device *pdev)
if (phy_data->no_adp_ctrl)
channel->obint_enable_bits = USB2_OBINT_IDCHG_EN;
- mutex_init(&channel->lock);
+ spin_lock_init(&channel->lock);
for (i = 0; i < NUM_OF_PHYS; i++) {
channel->rphys[i].phy = devm_phy_create(dev, NULL,
phy_data->phy_usb2_ops);
@@ -789,6 +790,20 @@ static int rcar_gen3_phy_usb2_probe(struct platform_device *pdev)
channel->vbus = NULL;
}
+ irq = platform_get_irq_optional(pdev, 0);
+ if (irq < 0 && irq != -ENXIO) {
+ ret = irq;
+ goto error;
+ } else if (irq > 0) {
+ INIT_WORK(&channel->work, rcar_gen3_phy_usb2_work);
+ ret = devm_request_irq(dev, irq, rcar_gen3_phy_usb2_irq,
+ IRQF_SHARED, dev_name(dev), channel);
+ if (ret < 0) {
+ dev_err(dev, "Failed to request irq (%d)\n", irq);
+ goto error;
+ }
+ }
+
provider = devm_of_phy_provider_register(dev, rcar_gen3_phy_usb2_xlate);
if (IS_ERR(provider)) {
dev_err(dev, "Failed to register PHY provider\n");
diff --git a/drivers/phy/rockchip/phy-rockchip-samsung-dcphy.c b/drivers/phy/rockchip/phy-rockchip-samsung-dcphy.c
index 08c78c1bafc9..28a052e17366 100644
--- a/drivers/phy/rockchip/phy-rockchip-samsung-dcphy.c
+++ b/drivers/phy/rockchip/phy-rockchip-samsung-dcphy.c
@@ -1653,7 +1653,7 @@ static __maybe_unused int samsung_mipi_dcphy_runtime_resume(struct device *dev)
return ret;
}
- clk_prepare_enable(samsung->ref_clk);
+ ret = clk_prepare_enable(samsung->ref_clk);
if (ret) {
dev_err(samsung->dev, "Failed to enable reference clock, %d\n", ret);
clk_disable_unprepare(samsung->pclk);
diff --git a/drivers/phy/rockchip/phy-rockchip-samsung-hdptx.c b/drivers/phy/rockchip/phy-rockchip-samsung-hdptx.c
index fe7c05748356..77236f012a1f 100644
--- a/drivers/phy/rockchip/phy-rockchip-samsung-hdptx.c
+++ b/drivers/phy/rockchip/phy-rockchip-samsung-hdptx.c
@@ -476,6 +476,8 @@ static const struct ropll_config ropll_tmds_cfg[] = {
1, 1, 0, 0x20, 0x0c, 1, 0x0e, 0, 0, },
{ 650000, 162, 162, 1, 1, 11, 1, 1, 1, 1, 1, 1, 1, 54, 0, 16, 4, 1,
1, 1, 0, 0x20, 0x0c, 1, 0x0e, 0, 0, },
+ { 502500, 84, 84, 1, 1, 7, 1, 1, 1, 1, 1, 1, 1, 11, 1, 4, 5,
+ 4, 11, 1, 0, 0x20, 0x0c, 1, 0x0e, 0, 0, },
{ 337500, 0x70, 0x70, 1, 1, 0xf, 1, 1, 1, 1, 1, 1, 1, 0x2, 0, 0x01, 5,
1, 1, 1, 0, 0x20, 0x0c, 1, 0x0e, 0, 0, },
{ 400000, 100, 100, 1, 1, 11, 1, 1, 0, 1, 0, 1, 1, 0x9, 0, 0x05, 0,
diff --git a/drivers/phy/starfive/phy-jh7110-usb.c b/drivers/phy/starfive/phy-jh7110-usb.c
index cb5454fbe2c8..b505d89860b4 100644
--- a/drivers/phy/starfive/phy-jh7110-usb.c
+++ b/drivers/phy/starfive/phy-jh7110-usb.c
@@ -18,6 +18,8 @@
#include <linux/usb/of.h>
#define USB_125M_CLK_RATE 125000000
+#define USB_CLK_MODE_OFF 0x0
+#define USB_CLK_MODE_RX_NORMAL_PWR BIT(1)
#define USB_LS_KEEPALIVE_OFF 0x4
#define USB_LS_KEEPALIVE_ENABLE BIT(4)
@@ -78,6 +80,7 @@ static int jh7110_usb2_phy_init(struct phy *_phy)
{
struct jh7110_usb2_phy *phy = phy_get_drvdata(_phy);
int ret;
+ unsigned int val;
ret = clk_set_rate(phy->usb_125m_clk, USB_125M_CLK_RATE);
if (ret)
@@ -87,6 +90,10 @@ static int jh7110_usb2_phy_init(struct phy *_phy)
if (ret)
return ret;
+ val = readl(phy->regs + USB_CLK_MODE_OFF);
+ val |= USB_CLK_MODE_RX_NORMAL_PWR;
+ writel(val, phy->regs + USB_CLK_MODE_OFF);
+
return 0;
}
diff --git a/drivers/phy/tegra/xusb-tegra186.c b/drivers/phy/tegra/xusb-tegra186.c
index fae6242aa730..23a23f2d64e5 100644
--- a/drivers/phy/tegra/xusb-tegra186.c
+++ b/drivers/phy/tegra/xusb-tegra186.c
@@ -237,6 +237,8 @@
#define DATA0_VAL_PD BIT(1)
#define USE_XUSB_AO BIT(4)
+#define TEGRA_UTMI_PAD_MAX 4
+
#define TEGRA186_LANE(_name, _offset, _shift, _mask, _type) \
{ \
.name = _name, \
@@ -269,7 +271,7 @@ struct tegra186_xusb_padctl {
/* UTMI bias and tracking */
struct clk *usb2_trk_clk;
- unsigned int bias_pad_enable;
+ DECLARE_BITMAP(utmi_pad_enabled, TEGRA_UTMI_PAD_MAX);
/* padctl context */
struct tegra186_xusb_padctl_context context;
@@ -603,12 +605,8 @@ static void tegra186_utmi_bias_pad_power_on(struct tegra_xusb_padctl *padctl)
u32 value;
int err;
- mutex_lock(&padctl->lock);
-
- if (priv->bias_pad_enable++ > 0) {
- mutex_unlock(&padctl->lock);
+ if (!bitmap_empty(priv->utmi_pad_enabled, TEGRA_UTMI_PAD_MAX))
return;
- }
err = clk_prepare_enable(priv->usb2_trk_clk);
if (err < 0)
@@ -658,8 +656,6 @@ static void tegra186_utmi_bias_pad_power_on(struct tegra_xusb_padctl *padctl)
} else {
clk_disable_unprepare(priv->usb2_trk_clk);
}
-
- mutex_unlock(&padctl->lock);
}
static void tegra186_utmi_bias_pad_power_off(struct tegra_xusb_padctl *padctl)
@@ -667,17 +663,8 @@ static void tegra186_utmi_bias_pad_power_off(struct tegra_xusb_padctl *padctl)
struct tegra186_xusb_padctl *priv = to_tegra186_xusb_padctl(padctl);
u32 value;
- mutex_lock(&padctl->lock);
-
- if (WARN_ON(priv->bias_pad_enable == 0)) {
- mutex_unlock(&padctl->lock);
- return;
- }
-
- if (--priv->bias_pad_enable > 0) {
- mutex_unlock(&padctl->lock);
+ if (!bitmap_empty(priv->utmi_pad_enabled, TEGRA_UTMI_PAD_MAX))
return;
- }
value = padctl_readl(padctl, XUSB_PADCTL_USB2_BIAS_PAD_CTL1);
value |= USB2_PD_TRK;
@@ -690,13 +677,13 @@ static void tegra186_utmi_bias_pad_power_off(struct tegra_xusb_padctl *padctl)
clk_disable_unprepare(priv->usb2_trk_clk);
}
- mutex_unlock(&padctl->lock);
}
static void tegra186_utmi_pad_power_on(struct phy *phy)
{
struct tegra_xusb_lane *lane = phy_get_drvdata(phy);
struct tegra_xusb_padctl *padctl = lane->pad->padctl;
+ struct tegra186_xusb_padctl *priv = to_tegra186_xusb_padctl(padctl);
struct tegra_xusb_usb2_port *port;
struct device *dev = padctl->dev;
unsigned int index = lane->index;
@@ -705,9 +692,16 @@ static void tegra186_utmi_pad_power_on(struct phy *phy)
if (!phy)
return;
+ mutex_lock(&padctl->lock);
+ if (test_bit(index, priv->utmi_pad_enabled)) {
+ mutex_unlock(&padctl->lock);
+ return;
+ }
+
port = tegra_xusb_find_usb2_port(padctl, index);
if (!port) {
dev_err(dev, "no port found for USB2 lane %u\n", index);
+ mutex_unlock(&padctl->lock);
return;
}
@@ -724,18 +718,28 @@ static void tegra186_utmi_pad_power_on(struct phy *phy)
value = padctl_readl(padctl, XUSB_PADCTL_USB2_OTG_PADX_CTL1(index));
value &= ~USB2_OTG_PD_DR;
padctl_writel(padctl, value, XUSB_PADCTL_USB2_OTG_PADX_CTL1(index));
+
+ set_bit(index, priv->utmi_pad_enabled);
+ mutex_unlock(&padctl->lock);
}
static void tegra186_utmi_pad_power_down(struct phy *phy)
{
struct tegra_xusb_lane *lane = phy_get_drvdata(phy);
struct tegra_xusb_padctl *padctl = lane->pad->padctl;
+ struct tegra186_xusb_padctl *priv = to_tegra186_xusb_padctl(padctl);
unsigned int index = lane->index;
u32 value;
if (!phy)
return;
+ mutex_lock(&padctl->lock);
+ if (!test_bit(index, priv->utmi_pad_enabled)) {
+ mutex_unlock(&padctl->lock);
+ return;
+ }
+
dev_dbg(padctl->dev, "power down UTMI pad %u\n", index);
value = padctl_readl(padctl, XUSB_PADCTL_USB2_OTG_PADX_CTL0(index));
@@ -748,7 +752,11 @@ static void tegra186_utmi_pad_power_down(struct phy *phy)
udelay(2);
+ clear_bit(index, priv->utmi_pad_enabled);
+
tegra186_utmi_bias_pad_power_off(padctl);
+
+ mutex_unlock(&padctl->lock);
}
static int tegra186_xusb_padctl_vbus_override(struct tegra_xusb_padctl *padctl,
diff --git a/drivers/phy/tegra/xusb.c b/drivers/phy/tegra/xusb.c
index 79d4814d758d..c89df95aa6ca 100644
--- a/drivers/phy/tegra/xusb.c
+++ b/drivers/phy/tegra/xusb.c
@@ -548,16 +548,16 @@ static int tegra_xusb_port_init(struct tegra_xusb_port *port,
err = dev_set_name(&port->dev, "%s-%u", name, index);
if (err < 0)
- goto unregister;
+ goto put_device;
err = device_add(&port->dev);
if (err < 0)
- goto unregister;
+ goto put_device;
return 0;
-unregister:
- device_unregister(&port->dev);
+put_device:
+ put_device(&port->dev);
return err;
}
diff --git a/drivers/pinctrl/qcom/pinctrl-msm.c b/drivers/pinctrl/qcom/pinctrl-msm.c
index 82f0cc43bbf4..0eb816395dc6 100644
--- a/drivers/pinctrl/qcom/pinctrl-msm.c
+++ b/drivers/pinctrl/qcom/pinctrl-msm.c
@@ -44,7 +44,6 @@
* @pctrl: pinctrl handle.
* @chip: gpiochip handle.
* @desc: pin controller descriptor
- * @restart_nb: restart notifier block.
* @irq: parent irq for the TLMM irq_chip.
* @intr_target_use_scm: route irq to application cpu using scm calls
* @lock: Spinlock to protect register resources as well
@@ -64,7 +63,6 @@ struct msm_pinctrl {
struct pinctrl_dev *pctrl;
struct gpio_chip chip;
struct pinctrl_desc desc;
- struct notifier_block restart_nb;
int irq;
@@ -1471,10 +1469,9 @@ static int msm_gpio_init(struct msm_pinctrl *pctrl)
return 0;
}
-static int msm_ps_hold_restart(struct notifier_block *nb, unsigned long action,
- void *data)
+static int msm_ps_hold_restart(struct sys_off_data *data)
{
- struct msm_pinctrl *pctrl = container_of(nb, struct msm_pinctrl, restart_nb);
+ struct msm_pinctrl *pctrl = data->cb_data;
writel(0, pctrl->regs[0] + PS_HOLD_OFFSET);
mdelay(1000);
@@ -1485,7 +1482,11 @@ static struct msm_pinctrl *poweroff_pctrl;
static void msm_ps_hold_poweroff(void)
{
- msm_ps_hold_restart(&poweroff_pctrl->restart_nb, 0, NULL);
+ struct sys_off_data data = {
+ .cb_data = poweroff_pctrl,
+ };
+
+ msm_ps_hold_restart(&data);
}
static void msm_pinctrl_setup_pm_reset(struct msm_pinctrl *pctrl)
@@ -1495,9 +1496,11 @@ static void msm_pinctrl_setup_pm_reset(struct msm_pinctrl *pctrl)
for (i = 0; i < pctrl->soc->nfunctions; i++)
if (!strcmp(func[i].name, "ps_hold")) {
- pctrl->restart_nb.notifier_call = msm_ps_hold_restart;
- pctrl->restart_nb.priority = 128;
- if (register_restart_handler(&pctrl->restart_nb))
+ if (devm_register_sys_off_handler(pctrl->dev,
+ SYS_OFF_MODE_RESTART,
+ 128,
+ msm_ps_hold_restart,
+ pctrl))
dev_err(pctrl->dev,
"failed to setup restart handler.\n");
poweroff_pctrl = pctrl;
@@ -1599,8 +1602,6 @@ void msm_pinctrl_remove(struct platform_device *pdev)
struct msm_pinctrl *pctrl = platform_get_drvdata(pdev);
gpiochip_remove(&pctrl->chip);
-
- unregister_restart_handler(&pctrl->restart_nb);
}
EXPORT_SYMBOL(msm_pinctrl_remove);
diff --git a/drivers/platform/x86/dell/dell-wmi-sysman/passobj-attributes.c b/drivers/platform/x86/dell/dell-wmi-sysman/passobj-attributes.c
index 230e6ee96636..d8f1bf5e58a0 100644
--- a/drivers/platform/x86/dell/dell-wmi-sysman/passobj-attributes.c
+++ b/drivers/platform/x86/dell/dell-wmi-sysman/passobj-attributes.c
@@ -45,7 +45,7 @@ static ssize_t current_password_store(struct kobject *kobj,
int length;
length = strlen(buf);
- if (buf[length-1] == '\n')
+ if (length && buf[length - 1] == '\n')
length--;
/* firmware does verifiation of min/max password length,
diff --git a/drivers/platform/x86/fujitsu-laptop.c b/drivers/platform/x86/fujitsu-laptop.c
index a0eae24ca9e6..162809140f68 100644
--- a/drivers/platform/x86/fujitsu-laptop.c
+++ b/drivers/platform/x86/fujitsu-laptop.c
@@ -17,13 +17,13 @@
/*
* fujitsu-laptop.c - Fujitsu laptop support, providing access to additional
* features made available on a range of Fujitsu laptops including the
- * P2xxx/P5xxx/S6xxx/S7xxx series.
+ * P2xxx/P5xxx/S2xxx/S6xxx/S7xxx series.
*
* This driver implements a vendor-specific backlight control interface for
* Fujitsu laptops and provides support for hotkeys present on certain Fujitsu
* laptops.
*
- * This driver has been tested on a Fujitsu Lifebook S6410, S7020 and
+ * This driver has been tested on a Fujitsu Lifebook S2110, S6410, S7020 and
* P8010. It should work on most P-series and S-series Lifebooks, but
* YMMV.
*
@@ -107,7 +107,11 @@
#define KEY2_CODE 0x411
#define KEY3_CODE 0x412
#define KEY4_CODE 0x413
-#define KEY5_CODE 0x420
+#define KEY5_CODE 0x414
+#define KEY6_CODE 0x415
+#define KEY7_CODE 0x416
+#define KEY8_CODE 0x417
+#define KEY9_CODE 0x420
/* Hotkey ringbuffer limits */
#define MAX_HOTKEY_RINGBUFFER_SIZE 100
@@ -560,7 +564,7 @@ static const struct key_entry keymap_default[] = {
{ KE_KEY, KEY2_CODE, { KEY_PROG2 } },
{ KE_KEY, KEY3_CODE, { KEY_PROG3 } },
{ KE_KEY, KEY4_CODE, { KEY_PROG4 } },
- { KE_KEY, KEY5_CODE, { KEY_RFKILL } },
+ { KE_KEY, KEY9_CODE, { KEY_RFKILL } },
/* Soft keys read from status flags */
{ KE_KEY, FLAG_RFKILL, { KEY_RFKILL } },
{ KE_KEY, FLAG_TOUCHPAD_TOGGLE, { KEY_TOUCHPAD_TOGGLE } },
@@ -584,6 +588,18 @@ static const struct key_entry keymap_p8010[] = {
{ KE_END, 0 }
};
+static const struct key_entry keymap_s2110[] = {
+ { KE_KEY, KEY1_CODE, { KEY_PROG1 } }, /* "A" */
+ { KE_KEY, KEY2_CODE, { KEY_PROG2 } }, /* "B" */
+ { KE_KEY, KEY3_CODE, { KEY_WWW } }, /* "Internet" */
+ { KE_KEY, KEY4_CODE, { KEY_EMAIL } }, /* "E-mail" */
+ { KE_KEY, KEY5_CODE, { KEY_STOPCD } },
+ { KE_KEY, KEY6_CODE, { KEY_PLAYPAUSE } },
+ { KE_KEY, KEY7_CODE, { KEY_PREVIOUSSONG } },
+ { KE_KEY, KEY8_CODE, { KEY_NEXTSONG } },
+ { KE_END, 0 }
+};
+
static const struct key_entry *keymap = keymap_default;
static int fujitsu_laptop_dmi_keymap_override(const struct dmi_system_id *id)
@@ -621,6 +637,15 @@ static const struct dmi_system_id fujitsu_laptop_dmi_table[] = {
},
.driver_data = (void *)keymap_p8010
},
+ {
+ .callback = fujitsu_laptop_dmi_keymap_override,
+ .ident = "Fujitsu LifeBook S2110",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "FUJITSU SIEMENS"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "LIFEBOOK S2110"),
+ },
+ .driver_data = (void *)keymap_s2110
+ },
{}
};
diff --git a/drivers/platform/x86/intel/pmc/arl.c b/drivers/platform/x86/intel/pmc/arl.c
index 320993bd6d31..f9c48738b853 100644
--- a/drivers/platform/x86/intel/pmc/arl.c
+++ b/drivers/platform/x86/intel/pmc/arl.c
@@ -681,6 +681,7 @@ static struct pmc_info arl_pmc_info_list[] = {
#define ARL_NPU_PCI_DEV 0xad1d
#define ARL_GNA_PCI_DEV 0xae4c
+#define ARL_H_NPU_PCI_DEV 0x7d1d
#define ARL_H_GNA_PCI_DEV 0x774c
/*
* Set power state of select devices that do not have drivers to D3
@@ -694,7 +695,7 @@ static void arl_d3_fixup(void)
static void arl_h_d3_fixup(void)
{
- pmc_core_set_device_d3(ARL_NPU_PCI_DEV);
+ pmc_core_set_device_d3(ARL_H_NPU_PCI_DEV);
pmc_core_set_device_d3(ARL_H_GNA_PCI_DEV);
}
diff --git a/drivers/platform/x86/think-lmi.c b/drivers/platform/x86/think-lmi.c
index 0fc275e461be..00b1e7c79a3d 100644
--- a/drivers/platform/x86/think-lmi.c
+++ b/drivers/platform/x86/think-lmi.c
@@ -1061,8 +1061,8 @@ static ssize_t current_value_store(struct kobject *kobj,
ret = -EINVAL;
goto out;
}
- set_str = kasprintf(GFP_KERNEL, "%s,%s,%s", setting->display_name,
- new_setting, tlmi_priv.pwd_admin->signature);
+ set_str = kasprintf(GFP_KERNEL, "%s,%s,%s", setting->name,
+ new_setting, tlmi_priv.pwd_admin->signature);
if (!set_str) {
ret = -ENOMEM;
goto out;
@@ -1092,7 +1092,7 @@ static ssize_t current_value_store(struct kobject *kobj,
goto out;
}
- set_str = kasprintf(GFP_KERNEL, "%s,%s;", setting->display_name,
+ set_str = kasprintf(GFP_KERNEL, "%s,%s;", setting->name,
new_setting);
if (!set_str) {
ret = -ENOMEM;
@@ -1120,11 +1120,11 @@ static ssize_t current_value_store(struct kobject *kobj,
}
if (auth_str)
- set_str = kasprintf(GFP_KERNEL, "%s,%s,%s", setting->display_name,
- new_setting, auth_str);
+ set_str = kasprintf(GFP_KERNEL, "%s,%s,%s", setting->name,
+ new_setting, auth_str);
else
- set_str = kasprintf(GFP_KERNEL, "%s,%s;", setting->display_name,
- new_setting);
+ set_str = kasprintf(GFP_KERNEL, "%s,%s;", setting->name,
+ new_setting);
if (!set_str) {
ret = -ENOMEM;
goto out;
@@ -1629,9 +1629,6 @@ static int tlmi_analyze(struct wmi_device *wdev)
continue;
}
- /* It is not allowed to have '/' for file name. Convert it into '\'. */
- strreplace(item, '/', '\\');
-
/* Remove the value part */
strreplace(item, ',', '\0');
@@ -1644,11 +1641,16 @@ static int tlmi_analyze(struct wmi_device *wdev)
}
setting->wdev = wdev;
setting->index = i;
+
+ strscpy(setting->name, item);
+ /* It is not allowed to have '/' for file name. Convert it into '\'. */
+ strreplace(item, '/', '\\');
strscpy(setting->display_name, item);
+
/* If BIOS selections supported, load those */
if (tlmi_priv.can_get_bios_selections) {
- ret = tlmi_get_bios_selections(setting->display_name,
- &setting->possible_values);
+ ret = tlmi_get_bios_selections(setting->name,
+ &setting->possible_values);
if (ret || !setting->possible_values)
pr_info("Error retrieving possible values for %d : %s\n",
i, setting->display_name);
diff --git a/drivers/platform/x86/think-lmi.h b/drivers/platform/x86/think-lmi.h
index a80452482227..9b014644d316 100644
--- a/drivers/platform/x86/think-lmi.h
+++ b/drivers/platform/x86/think-lmi.h
@@ -90,6 +90,7 @@ struct tlmi_attr_setting {
struct kobject kobj;
struct wmi_device *wdev;
int index;
+ char name[TLMI_SETTINGS_MAXLEN];
char display_name[TLMI_SETTINGS_MAXLEN];
char *possible_values;
};
diff --git a/drivers/platform/x86/thinkpad_acpi.c b/drivers/platform/x86/thinkpad_acpi.c
index 92b21e49faf6..657625dd60a0 100644
--- a/drivers/platform/x86/thinkpad_acpi.c
+++ b/drivers/platform/x86/thinkpad_acpi.c
@@ -231,6 +231,7 @@ enum tpacpi_hkey_event_t {
/* Thermal events */
TP_HKEY_EV_ALARM_BAT_HOT = 0x6011, /* battery too hot */
TP_HKEY_EV_ALARM_BAT_XHOT = 0x6012, /* battery critically hot */
+ TP_HKEY_EV_ALARM_BAT_LIM_CHANGE = 0x6013, /* battery charge limit changed*/
TP_HKEY_EV_ALARM_SENSOR_HOT = 0x6021, /* sensor too hot */
TP_HKEY_EV_ALARM_SENSOR_XHOT = 0x6022, /* sensor critically hot */
TP_HKEY_EV_THM_TABLE_CHANGED = 0x6030, /* windows; thermal table changed */
@@ -3777,6 +3778,10 @@ static bool hotkey_notify_6xxx(const u32 hkey, bool *send_acpi_ev)
pr_alert("THERMAL EMERGENCY: battery is extremely hot!\n");
/* recommended action: immediate sleep/hibernate */
break;
+ case TP_HKEY_EV_ALARM_BAT_LIM_CHANGE:
+ pr_debug("Battery Info: battery charge threshold changed\n");
+ /* User changed charging threshold. No action needed */
+ return true;
case TP_HKEY_EV_ALARM_SENSOR_HOT:
pr_crit("THERMAL ALARM: a sensor reports something is too hot!\n");
/* recommended action: warn user through gui, that */
diff --git a/drivers/pmdomain/core.c b/drivers/pmdomain/core.c
index 9b2f28b34bb5..d6c1ddb807b2 100644
--- a/drivers/pmdomain/core.c
+++ b/drivers/pmdomain/core.c
@@ -3126,7 +3126,7 @@ struct device *genpd_dev_pm_attach_by_id(struct device *dev,
/* Verify that the index is within a valid range. */
num_domains = of_count_phandle_with_args(dev->of_node, "power-domains",
"#power-domain-cells");
- if (index >= num_domains)
+ if (num_domains < 0 || index >= num_domains)
return NULL;
/* Allocate and register device on the genpd bus. */
diff --git a/drivers/pmdomain/renesas/rcar-gen4-sysc.c b/drivers/pmdomain/renesas/rcar-gen4-sysc.c
index 66409cff2083..e001b5c25bed 100644
--- a/drivers/pmdomain/renesas/rcar-gen4-sysc.c
+++ b/drivers/pmdomain/renesas/rcar-gen4-sysc.c
@@ -338,11 +338,6 @@ static int __init rcar_gen4_sysc_pd_init(void)
struct rcar_gen4_sysc_pd *pd;
size_t n;
- if (!area->name) {
- /* Skip NULLified area */
- continue;
- }
-
n = strlen(area->name) + 1;
pd = kzalloc(sizeof(*pd) + n, GFP_KERNEL);
if (!pd) {
diff --git a/drivers/pmdomain/renesas/rcar-sysc.c b/drivers/pmdomain/renesas/rcar-sysc.c
index dce1a6d37e80..047495f54e8a 100644
--- a/drivers/pmdomain/renesas/rcar-sysc.c
+++ b/drivers/pmdomain/renesas/rcar-sysc.c
@@ -396,11 +396,6 @@ static int __init rcar_sysc_pd_init(void)
struct rcar_sysc_pd *pd;
size_t n;
- if (!area->name) {
- /* Skip NULLified area */
- continue;
- }
-
n = strlen(area->name) + 1;
pd = kzalloc(sizeof(*pd) + n, GFP_KERNEL);
if (!pd) {
diff --git a/drivers/ptp/ptp_ocp.c b/drivers/ptp/ptp_ocp.c
index 2ccdca4f6960..e63481f24238 100644
--- a/drivers/ptp/ptp_ocp.c
+++ b/drivers/ptp/ptp_ocp.c
@@ -315,6 +315,8 @@ struct ptp_ocp_serial_port {
#define OCP_BOARD_ID_LEN 13
#define OCP_SERIAL_LEN 6
#define OCP_SMA_NUM 4
+#define OCP_SIGNAL_NUM 4
+#define OCP_FREQ_NUM 4
enum {
PORT_GNSS,
@@ -342,8 +344,8 @@ struct ptp_ocp {
struct dcf_master_reg __iomem *dcf_out;
struct dcf_slave_reg __iomem *dcf_in;
struct tod_reg __iomem *nmea_out;
- struct frequency_reg __iomem *freq_in[4];
- struct ptp_ocp_ext_src *signal_out[4];
+ struct frequency_reg __iomem *freq_in[OCP_FREQ_NUM];
+ struct ptp_ocp_ext_src *signal_out[OCP_SIGNAL_NUM];
struct ptp_ocp_ext_src *pps;
struct ptp_ocp_ext_src *ts0;
struct ptp_ocp_ext_src *ts1;
@@ -378,10 +380,12 @@ struct ptp_ocp {
u32 utc_tai_offset;
u32 ts_window_adjust;
u64 fw_cap;
- struct ptp_ocp_signal signal[4];
+ struct ptp_ocp_signal signal[OCP_SIGNAL_NUM];
struct ptp_ocp_sma_connector sma[OCP_SMA_NUM];
const struct ocp_sma_op *sma_op;
struct dpll_device *dpll;
+ int signals_nr;
+ int freq_in_nr;
};
#define OCP_REQ_TIMESTAMP BIT(0)
@@ -2697,6 +2701,8 @@ ptp_ocp_fb_board_init(struct ptp_ocp *bp, struct ocp_resource *r)
bp->eeprom_map = fb_eeprom_map;
bp->fw_version = ioread32(&bp->image->version);
bp->sma_op = &ocp_fb_sma_op;
+ bp->signals_nr = 4;
+ bp->freq_in_nr = 4;
ptp_ocp_fb_set_version(bp);
@@ -2862,6 +2868,8 @@ ptp_ocp_art_board_init(struct ptp_ocp *bp, struct ocp_resource *r)
bp->fw_version = ioread32(&bp->reg->version);
bp->fw_tag = 2;
bp->sma_op = &ocp_art_sma_op;
+ bp->signals_nr = 4;
+ bp->freq_in_nr = 4;
/* Enable MAC serial port during initialisation */
iowrite32(1, &bp->board_config->mro50_serial_activate);
@@ -2888,6 +2896,8 @@ ptp_ocp_adva_board_init(struct ptp_ocp *bp, struct ocp_resource *r)
bp->flash_start = 0xA00000;
bp->eeprom_map = fb_eeprom_map;
bp->sma_op = &ocp_adva_sma_op;
+ bp->signals_nr = 2;
+ bp->freq_in_nr = 2;
version = ioread32(&bp->image->version);
/* if lower 16 bits are empty, this is the fw loader. */
@@ -4008,7 +4018,7 @@ _signal_summary_show(struct seq_file *s, struct ptp_ocp *bp, int nr)
{
struct signal_reg __iomem *reg = bp->signal_out[nr]->mem;
struct ptp_ocp_signal *signal = &bp->signal[nr];
- char label[8];
+ char label[16];
bool on;
u32 val;
@@ -4031,7 +4041,7 @@ static void
_frequency_summary_show(struct seq_file *s, int nr,
struct frequency_reg __iomem *reg)
{
- char label[8];
+ char label[16];
bool on;
u32 val;
@@ -4175,11 +4185,11 @@ ptp_ocp_summary_show(struct seq_file *s, void *data)
}
if (bp->fw_cap & OCP_CAP_SIGNAL)
- for (i = 0; i < 4; i++)
+ for (i = 0; i < bp->signals_nr; i++)
_signal_summary_show(s, bp, i);
if (bp->fw_cap & OCP_CAP_FREQ)
- for (i = 0; i < 4; i++)
+ for (i = 0; i < bp->freq_in_nr; i++)
_frequency_summary_show(s, i, bp->freq_in[i]);
if (bp->irig_out) {
diff --git a/drivers/regulator/max20086-regulator.c b/drivers/regulator/max20086-regulator.c
index 59eb23d467ec..198d45f8e884 100644
--- a/drivers/regulator/max20086-regulator.c
+++ b/drivers/regulator/max20086-regulator.c
@@ -132,7 +132,7 @@ static int max20086_regulators_register(struct max20086 *chip)
static int max20086_parse_regulators_dt(struct max20086 *chip, bool *boot_on)
{
- struct of_regulator_match matches[MAX20086_MAX_REGULATORS] = { };
+ struct of_regulator_match *matches;
struct device_node *node;
unsigned int i;
int ret;
@@ -143,6 +143,11 @@ static int max20086_parse_regulators_dt(struct max20086 *chip, bool *boot_on)
return -ENODEV;
}
+ matches = devm_kcalloc(chip->dev, chip->info->num_outputs,
+ sizeof(*matches), GFP_KERNEL);
+ if (!matches)
+ return -ENOMEM;
+
for (i = 0; i < chip->info->num_outputs; ++i)
matches[i].name = max20086_output_names[i];
diff --git a/drivers/remoteproc/qcom_wcnss.c b/drivers/remoteproc/qcom_wcnss.c
index 775b056d795a..2c7e519a2254 100644
--- a/drivers/remoteproc/qcom_wcnss.c
+++ b/drivers/remoteproc/qcom_wcnss.c
@@ -456,7 +456,8 @@ static int wcnss_init_regulators(struct qcom_wcnss *wcnss,
if (wcnss->num_pds) {
info += wcnss->num_pds;
/* Handle single power domain case */
- num_vregs += num_pd_vregs - wcnss->num_pds;
+ if (wcnss->num_pds < num_pd_vregs)
+ num_vregs += num_pd_vregs - wcnss->num_pds;
} else {
num_vregs += num_pd_vregs;
}
diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig
index 5a3c670aec27..5522310bab8d 100644
--- a/drivers/scsi/Kconfig
+++ b/drivers/scsi/Kconfig
@@ -403,6 +403,7 @@ config SCSI_ACARD
config SCSI_AHA152X
tristate "Adaptec AHA152X/2825 support"
depends on ISA && SCSI
+ depends on !HIGHMEM
select SCSI_SPI_ATTRS
select CHECK_SIGNATURE
help
@@ -795,6 +796,7 @@ config SCSI_PPA
tristate "IOMEGA parallel port (ppa - older drives)"
depends on SCSI && PARPORT_PC
depends on HAS_IOPORT
+ depends on !HIGHMEM
help
This driver supports older versions of IOMEGA's parallel port ZIP
drive (a 100 MB removable media device).
@@ -822,6 +824,7 @@ config SCSI_PPA
config SCSI_IMM
tristate "IOMEGA parallel port (imm - newer drives)"
depends on SCSI && PARPORT_PC
+ depends on !HIGHMEM
help
This driver supports newer versions of IOMEGA's parallel port ZIP
drive (a 100 MB removable media device).
diff --git a/drivers/scsi/aha152x.c b/drivers/scsi/aha152x.c
index 4276f868cd91..e94c0a19c435 100644
--- a/drivers/scsi/aha152x.c
+++ b/drivers/scsi/aha152x.c
@@ -746,7 +746,6 @@ struct Scsi_Host *aha152x_probe_one(struct aha152x_setup *setup)
/* need to have host registered before triggering any interrupt */
list_add_tail(&HOSTDATA(shpnt)->host_list, &aha152x_host_list);
- shpnt->no_highmem = true;
shpnt->io_port = setup->io_port;
shpnt->n_io_port = IO_RANGE;
shpnt->irq = setup->irq;
diff --git a/drivers/scsi/imm.c b/drivers/scsi/imm.c
index 1d4c7310f1a6..0821cf994b98 100644
--- a/drivers/scsi/imm.c
+++ b/drivers/scsi/imm.c
@@ -1224,7 +1224,6 @@ static int __imm_attach(struct parport *pb)
host = scsi_host_alloc(&imm_template, sizeof(imm_struct *));
if (!host)
goto out1;
- host->no_highmem = true;
host->io_port = pb->base;
host->n_io_port = ports;
host->dma_channel = -1;
diff --git a/drivers/scsi/ppa.c b/drivers/scsi/ppa.c
index a06329b47851..1ed3171f1797 100644
--- a/drivers/scsi/ppa.c
+++ b/drivers/scsi/ppa.c
@@ -1104,7 +1104,6 @@ static int __ppa_attach(struct parport *pb)
host = scsi_host_alloc(&ppa_template, sizeof(ppa_struct *));
if (!host)
goto out1;
- host->no_highmem = true;
host->io_port = pb->base;
host->n_io_port = ports;
host->dma_channel = -1;
diff --git a/drivers/scsi/scsi_ioctl.c b/drivers/scsi/scsi_ioctl.c
index 2fa45556e1ea..0ddc95bafc71 100644
--- a/drivers/scsi/scsi_ioctl.c
+++ b/drivers/scsi/scsi_ioctl.c
@@ -601,7 +601,7 @@ static int sg_scsi_ioctl(struct request_queue *q, bool open_for_write,
}
if (bytes) {
- err = blk_rq_map_kern(q, rq, buffer, bytes, GFP_NOIO);
+ err = blk_rq_map_kern(rq, buffer, bytes, GFP_NOIO);
if (err)
goto error;
}
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 1b43013d72c0..144c72f0737a 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -313,8 +313,7 @@ retry:
return PTR_ERR(req);
if (bufflen) {
- ret = blk_rq_map_kern(sdev->request_queue, req,
- buffer, bufflen, GFP_NOIO);
+ ret = blk_rq_map_kern(req, buffer, bufflen, GFP_NOIO);
if (ret)
goto out;
}
@@ -2004,9 +2003,6 @@ void scsi_init_limits(struct Scsi_Host *shost, struct queue_limits *lim)
lim->dma_alignment = max_t(unsigned int,
shost->dma_alignment, dma_get_cache_alignment() - 1);
- if (shost->no_highmem)
- lim->features |= BLK_FEAT_BOUNCE_HIGH;
-
/*
* Propagate the DMA formation properties to the dma-mapping layer as
* a courtesy service to the LLDDs. This needs to check that the buses
diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index 7a447ff600d2..a8db66428f80 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -169,6 +169,7 @@ static void *sd_zbc_alloc_report_buffer(struct scsi_disk *sdkp,
unsigned int nr_zones, size_t *buflen)
{
struct request_queue *q = sdkp->disk->queue;
+ unsigned int max_segments;
size_t bufsize;
void *buf;
@@ -180,12 +181,15 @@ static void *sd_zbc_alloc_report_buffer(struct scsi_disk *sdkp,
* Furthermore, since the report zone command cannot be split, make
* sure that the allocated buffer can always be mapped by limiting the
* number of pages allocated to the HBA max segments limit.
+ * Since max segments can be larger than the max inline bio vectors,
+ * further limit the allocated buffer to BIO_MAX_INLINE_VECS.
*/
nr_zones = min(nr_zones, sdkp->zone_info.nr_zones);
bufsize = roundup((nr_zones + 1) * 64, SECTOR_SIZE);
bufsize = min_t(size_t, bufsize,
queue_max_hw_sectors(q) << SECTOR_SHIFT);
- bufsize = min_t(size_t, bufsize, queue_max_segments(q) << PAGE_SHIFT);
+ max_segments = min(BIO_MAX_INLINE_VECS, queue_max_segments(q));
+ bufsize = min_t(size_t, bufsize, max_segments << PAGE_SHIFT);
while (bufsize >= SECTOR_SIZE) {
buf = kvzalloc(bufsize, GFP_KERNEL | __GFP_NORETRY);
diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c
index 35db061ae3ec..2e6b2412d2c9 100644
--- a/drivers/scsi/storvsc_drv.c
+++ b/drivers/scsi/storvsc_drv.c
@@ -1819,6 +1819,7 @@ static int storvsc_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *scmnd)
return SCSI_MLQUEUE_DEVICE_BUSY;
}
+ payload->rangecount = 1;
payload->range.len = length;
payload->range.offset = offset_in_hvpg;
diff --git a/drivers/soc/samsung/exynos-usi.c b/drivers/soc/samsung/exynos-usi.c
index c5661ac19f7b..5f7bdf3bab05 100644
--- a/drivers/soc/samsung/exynos-usi.c
+++ b/drivers/soc/samsung/exynos-usi.c
@@ -233,7 +233,7 @@ static void exynos_usi_unconfigure(void *data)
/* Make sure that we've stopped providing the clock to USI IP */
val = readl(usi->regs + USI_OPTION);
val &= ~USI_OPTION_CLKREQ_ON;
- val |= ~USI_OPTION_CLKSTOP_ON;
+ val |= USI_OPTION_CLKSTOP_ON;
writel(val, usi->regs + USI_OPTION);
/* Set USI block state to reset */
diff --git a/drivers/soundwire/bus.c b/drivers/soundwire/bus.c
index 6f8a20014e76..39aecd34c641 100644
--- a/drivers/soundwire/bus.c
+++ b/drivers/soundwire/bus.c
@@ -122,6 +122,10 @@ int sdw_bus_master_add(struct sdw_bus *bus, struct device *parent,
set_bit(SDW_GROUP13_DEV_NUM, bus->assigned);
set_bit(SDW_MASTER_DEV_NUM, bus->assigned);
+ ret = sdw_irq_create(bus, fwnode);
+ if (ret)
+ return ret;
+
/*
* SDW is an enumerable bus, but devices can be powered off. So,
* they won't be able to report as present.
@@ -138,6 +142,7 @@ int sdw_bus_master_add(struct sdw_bus *bus, struct device *parent,
if (ret < 0) {
dev_err(bus->dev, "Finding slaves failed:%d\n", ret);
+ sdw_irq_delete(bus);
return ret;
}
@@ -156,10 +161,6 @@ int sdw_bus_master_add(struct sdw_bus *bus, struct device *parent,
bus->params.curr_bank = SDW_BANK0;
bus->params.next_bank = SDW_BANK1;
- ret = sdw_irq_create(bus, fwnode);
- if (ret)
- return ret;
-
return 0;
}
EXPORT_SYMBOL(sdw_bus_master_add);
diff --git a/drivers/spi/spi-fsl-dspi.c b/drivers/spi/spi-fsl-dspi.c
index 067c954cb6ea..863781ba6c16 100644
--- a/drivers/spi/spi-fsl-dspi.c
+++ b/drivers/spi/spi-fsl-dspi.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0+
//
// Copyright 2013 Freescale Semiconductor, Inc.
-// Copyright 2020 NXP
+// Copyright 2020-2025 NXP
//
// Freescale DSPI driver
// This file contains a driver for the Freescale DSPI
@@ -62,6 +62,7 @@
#define SPI_SR_TFIWF BIT(18)
#define SPI_SR_RFDF BIT(17)
#define SPI_SR_CMDFFF BIT(16)
+#define SPI_SR_TXRXS BIT(30)
#define SPI_SR_CLEAR (SPI_SR_TCFQF | \
SPI_SR_TFUF | SPI_SR_TFFF | \
SPI_SR_CMDTCF | SPI_SR_SPEF | \
@@ -921,9 +922,20 @@ static int dspi_transfer_one_message(struct spi_controller *ctlr,
struct spi_transfer *transfer;
bool cs = false;
int status = 0;
+ u32 val = 0;
+ bool cs_change = false;
message->actual_length = 0;
+ /* Put DSPI in running mode if halted. */
+ regmap_read(dspi->regmap, SPI_MCR, &val);
+ if (val & SPI_MCR_HALT) {
+ regmap_update_bits(dspi->regmap, SPI_MCR, SPI_MCR_HALT, 0);
+ while (regmap_read(dspi->regmap, SPI_SR, &val) >= 0 &&
+ !(val & SPI_SR_TXRXS))
+ ;
+ }
+
list_for_each_entry(transfer, &message->transfers, transfer_list) {
dspi->cur_transfer = transfer;
dspi->cur_msg = message;
@@ -953,6 +965,7 @@ static int dspi_transfer_one_message(struct spi_controller *ctlr,
dspi->tx_cmd |= SPI_PUSHR_CMD_CONT;
}
+ cs_change = transfer->cs_change;
dspi->tx = transfer->tx_buf;
dspi->rx = transfer->rx_buf;
dspi->len = transfer->len;
@@ -962,6 +975,8 @@ static int dspi_transfer_one_message(struct spi_controller *ctlr,
SPI_MCR_CLR_TXF | SPI_MCR_CLR_RXF,
SPI_MCR_CLR_TXF | SPI_MCR_CLR_RXF);
+ regmap_write(dspi->regmap, SPI_SR, SPI_SR_CLEAR);
+
spi_take_timestamp_pre(dspi->ctlr, dspi->cur_transfer,
dspi->progress, !dspi->irq);
@@ -988,6 +1003,15 @@ static int dspi_transfer_one_message(struct spi_controller *ctlr,
dspi_deassert_cs(spi, &cs);
}
+ if (status || !cs_change) {
+ /* Put DSPI in stop mode */
+ regmap_update_bits(dspi->regmap, SPI_MCR,
+ SPI_MCR_HALT, SPI_MCR_HALT);
+ while (regmap_read(dspi->regmap, SPI_SR, &val) >= 0 &&
+ val & SPI_SR_TXRXS)
+ ;
+ }
+
message->status = status;
spi_finalize_current_message(ctlr);
@@ -1167,6 +1191,20 @@ static int dspi_resume(struct device *dev)
static SIMPLE_DEV_PM_OPS(dspi_pm, dspi_suspend, dspi_resume);
+static const struct regmap_range dspi_yes_ranges[] = {
+ regmap_reg_range(SPI_MCR, SPI_MCR),
+ regmap_reg_range(SPI_TCR, SPI_CTAR(3)),
+ regmap_reg_range(SPI_SR, SPI_TXFR3),
+ regmap_reg_range(SPI_RXFR0, SPI_RXFR3),
+ regmap_reg_range(SPI_CTARE(0), SPI_CTARE(3)),
+ regmap_reg_range(SPI_SREX, SPI_SREX),
+};
+
+static const struct regmap_access_table dspi_access_table = {
+ .yes_ranges = dspi_yes_ranges,
+ .n_yes_ranges = ARRAY_SIZE(dspi_yes_ranges),
+};
+
static const struct regmap_range dspi_volatile_ranges[] = {
regmap_reg_range(SPI_MCR, SPI_TCR),
regmap_reg_range(SPI_SR, SPI_SR),
@@ -1184,6 +1222,8 @@ static const struct regmap_config dspi_regmap_config = {
.reg_stride = 4,
.max_register = 0x88,
.volatile_table = &dspi_volatile_table,
+ .rd_table = &dspi_access_table,
+ .wr_table = &dspi_access_table,
};
static const struct regmap_range dspi_xspi_volatile_ranges[] = {
@@ -1205,6 +1245,8 @@ static const struct regmap_config dspi_xspi_regmap_config[] = {
.reg_stride = 4,
.max_register = 0x13c,
.volatile_table = &dspi_xspi_volatile_table,
+ .rd_table = &dspi_access_table,
+ .wr_table = &dspi_access_table,
},
{
.name = "pushr",
@@ -1227,6 +1269,8 @@ static int dspi_init(struct fsl_dspi *dspi)
if (!spi_controller_is_target(dspi->ctlr))
mcr |= SPI_MCR_HOST;
+ mcr |= SPI_MCR_HALT;
+
regmap_write(dspi->regmap, SPI_MCR, mcr);
regmap_write(dspi->regmap, SPI_SR, SPI_SR_CLEAR);
diff --git a/drivers/spi/spi-loopback-test.c b/drivers/spi/spi-loopback-test.c
index 31a878d9458d..7740f94847a8 100644
--- a/drivers/spi/spi-loopback-test.c
+++ b/drivers/spi/spi-loopback-test.c
@@ -420,7 +420,7 @@ MODULE_LICENSE("GPL");
static void spi_test_print_hex_dump(char *pre, const void *ptr, size_t len)
{
/* limit the hex_dump */
- if (len < 1024) {
+ if (len <= 1024) {
print_hex_dump(KERN_INFO, pre,
DUMP_PREFIX_OFFSET, 16, 1,
ptr, len, 0);
diff --git a/drivers/spi/spi-sun4i.c b/drivers/spi/spi-sun4i.c
index f89826d7dc49..aa92fd5a35a9 100644
--- a/drivers/spi/spi-sun4i.c
+++ b/drivers/spi/spi-sun4i.c
@@ -264,6 +264,9 @@ static int sun4i_spi_transfer_one(struct spi_controller *host,
else
reg |= SUN4I_CTL_DHB;
+ /* Now that the settings are correct, enable the interface */
+ reg |= SUN4I_CTL_ENABLE;
+
sun4i_spi_write(sspi, SUN4I_CTL_REG, reg);
/* Ensure that we have a parent clock fast enough */
@@ -404,7 +407,7 @@ static int sun4i_spi_runtime_resume(struct device *dev)
}
sun4i_spi_write(sspi, SUN4I_CTL_REG,
- SUN4I_CTL_ENABLE | SUN4I_CTL_MASTER | SUN4I_CTL_TP);
+ SUN4I_CTL_MASTER | SUN4I_CTL_TP);
return 0;
diff --git a/drivers/spi/spi-tegra114.c b/drivers/spi/spi-tegra114.c
index 2a8bb798e95b..795a8482c2c7 100644
--- a/drivers/spi/spi-tegra114.c
+++ b/drivers/spi/spi-tegra114.c
@@ -728,9 +728,9 @@ static int tegra_spi_set_hw_cs_timing(struct spi_device *spi)
u32 inactive_cycles;
u8 cs_state;
- if ((setup->unit && setup->unit != SPI_DELAY_UNIT_SCK) ||
- (hold->unit && hold->unit != SPI_DELAY_UNIT_SCK) ||
- (inactive->unit && inactive->unit != SPI_DELAY_UNIT_SCK)) {
+ if ((setup->value && setup->unit != SPI_DELAY_UNIT_SCK) ||
+ (hold->value && hold->unit != SPI_DELAY_UNIT_SCK) ||
+ (inactive->value && inactive->unit != SPI_DELAY_UNIT_SCK)) {
dev_err(&spi->dev,
"Invalid delay unit %d, should be SPI_DELAY_UNIT_SCK\n",
SPI_DELAY_UNIT_SCK);
diff --git a/drivers/thermal/intel/x86_pkg_temp_thermal.c b/drivers/thermal/intel/x86_pkg_temp_thermal.c
index 496abf8e55e0..2841d14914b7 100644
--- a/drivers/thermal/intel/x86_pkg_temp_thermal.c
+++ b/drivers/thermal/intel/x86_pkg_temp_thermal.c
@@ -329,6 +329,7 @@ static int pkg_temp_thermal_device_add(unsigned int cpu)
tj_max = intel_tcc_get_tjmax(cpu);
if (tj_max < 0)
return tj_max;
+ tj_max *= 1000;
zonedev = kzalloc(sizeof(*zonedev), GFP_KERNEL);
if (!zonedev)
diff --git a/drivers/usb/gadget/function/f_midi2.c b/drivers/usb/gadget/function/f_midi2.c
index 12e866fb311d..0a800ba53816 100644
--- a/drivers/usb/gadget/function/f_midi2.c
+++ b/drivers/usb/gadget/function/f_midi2.c
@@ -475,7 +475,7 @@ static void reply_ump_stream_ep_info(struct f_midi2_ep *ep)
/* reply a UMP EP device info */
static void reply_ump_stream_ep_device(struct f_midi2_ep *ep)
{
- struct snd_ump_stream_msg_devince_info rep = {
+ struct snd_ump_stream_msg_device_info rep = {
.type = UMP_MSG_TYPE_STREAM,
.status = UMP_STREAM_MSG_STATUS_DEVICE_INFO,
.manufacture_id = ep->info.manufacturer,
diff --git a/drivers/usb/storage/usb.c b/drivers/usb/storage/usb.c
index d36f3b6992bb..152ee3376550 100644
--- a/drivers/usb/storage/usb.c
+++ b/drivers/usb/storage/usb.c
@@ -1056,13 +1056,20 @@ int usb_stor_probe1(struct us_data **pus,
goto BadDevice;
/*
- * Some USB host controllers can't do DMA; they have to use PIO.
- * For such controllers we need to make sure the block layer sets
- * up bounce buffers in addressable memory.
+ * Some USB host controllers can't do DMA: They have to use PIO, or they
+ * have to use a small dedicated local memory area, or they have other
+ * restrictions on addressable memory.
+ *
+ * We can't support these controllers on highmem systems as we don't
+ * kmap or bounce buffer.
*/
- if (!hcd_uses_dma(bus_to_hcd(us->pusb_dev->bus)) ||
- bus_to_hcd(us->pusb_dev->bus)->localmem_pool)
- host->no_highmem = true;
+ if (IS_ENABLED(CONFIG_HIGHMEM) &&
+ (!hcd_uses_dma(bus_to_hcd(us->pusb_dev->bus)) ||
+ bus_to_hcd(us->pusb_dev->bus)->localmem_pool)) {
+ dev_warn(&intf->dev, "USB Mass Storage not supported on this host controller\n");
+ result = -EINVAL;
+ goto release;
+ }
/* Get the unusual_devs entries and the descriptors */
result = get_device_info(us, id, unusual_dev);
@@ -1081,6 +1088,7 @@ int usb_stor_probe1(struct us_data **pus,
BadDevice:
usb_stor_dbg(us, "storage_probe() failed\n");
+release:
release_everything(us);
return result;
}
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 32619d146cbc..1286d96a29bc 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -164,4 +164,5 @@ const struct address_space_operations v9fs_addr_operations = {
.invalidate_folio = netfs_invalidate_folio,
.direct_IO = noop_direct_IO,
.writepages = netfs_writepages,
+ .migrate_folio = filemap_migrate_folio,
};
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 9e7b1fe82c27..bfb69e066672 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -943,7 +943,7 @@ static struct dentry *afs_lookup_atsys(struct inode *dir, struct dentry *dentry)
}
strcpy(p, name);
- ret = lookup_one_len(buf, dentry->d_parent, len);
+ ret = lookup_noperm(&QSTR(buf), dentry->d_parent);
if (IS_ERR(ret) || d_is_positive(ret))
goto out_s;
dput(ret);
diff --git a/fs/afs/dir_silly.c b/fs/afs/dir_silly.c
index a1e581946b93..0b80eb93fa40 100644
--- a/fs/afs/dir_silly.c
+++ b/fs/afs/dir_silly.c
@@ -113,16 +113,14 @@ int afs_sillyrename(struct afs_vnode *dvnode, struct afs_vnode *vnode,
sdentry = NULL;
do {
- int slen;
-
dput(sdentry);
sillycounter++;
/* Create a silly name. Note that the ".__afs" prefix is
* understood by the salvager and must not be changed.
*/
- slen = scnprintf(silly, sizeof(silly), ".__afs%04X", sillycounter);
- sdentry = lookup_one_len(silly, dentry->d_parent, slen);
+ scnprintf(silly, sizeof(silly), ".__afs%04X", sillycounter);
+ sdentry = lookup_noperm(&QSTR(silly), dentry->d_parent);
/* N.B. Better to return EBUSY here ... it could be dangerous
* to delete the file while it's in use.
diff --git a/fs/aio.c b/fs/aio.c
index 7b976b564cfc..793b7b15ec4b 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1511,6 +1511,7 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb, int rw_type)
{
int ret;
+ req->ki_write_stream = 0;
req->ki_complete = aio_complete_rw;
req->private = NULL;
req->ki_pos = iocb->aio_offset;
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 583ac81669c2..e51e7d88980a 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -24,10 +24,51 @@
#include <linux/uaccess.h>
+#include "internal.h"
+
static struct vfsmount *anon_inode_mnt __ro_after_init;
static struct inode *anon_inode_inode __ro_after_init;
/*
+ * User space expects anonymous inodes to have no file type in st_mode.
+ *
+ * In particular, 'lsof' has this legacy logic:
+ *
+ * type = s->st_mode & S_IFMT;
+ * switch (type) {
+ * ...
+ * case 0:
+ * if (!strcmp(p, "anon_inode"))
+ * Lf->ntype = Ntype = N_ANON_INODE;
+ *
+ * to detect our old anon_inode logic.
+ *
+ * Rather than mess with our internal sane inode data, just fix it
+ * up here in getattr() by masking off the format bits.
+ */
+int anon_inode_getattr(struct mnt_idmap *idmap, const struct path *path,
+ struct kstat *stat, u32 request_mask,
+ unsigned int query_flags)
+{
+ struct inode *inode = d_inode(path->dentry);
+
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
+ stat->mode &= ~S_IFMT;
+ return 0;
+}
+
+int anon_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
+ struct iattr *attr)
+{
+ return -EOPNOTSUPP;
+}
+
+static const struct inode_operations anon_inode_operations = {
+ .getattr = anon_inode_getattr,
+ .setattr = anon_inode_setattr,
+};
+
+/*
* anon_inodefs_dname() is called from d_path().
*/
static char *anon_inodefs_dname(struct dentry *dentry, char *buffer, int buflen)
@@ -45,6 +86,8 @@ static int anon_inodefs_init_fs_context(struct fs_context *fc)
struct pseudo_fs_context *ctx = init_pseudo(fc, ANON_INODE_FS_MAGIC);
if (!ctx)
return -ENOMEM;
+ fc->s_iflags |= SB_I_NOEXEC;
+ fc->s_iflags |= SB_I_NODEV;
ctx->dops = &anon_inodefs_dentry_operations;
return 0;
}
@@ -66,6 +109,7 @@ static struct inode *anon_inode_make_secure_inode(
if (IS_ERR(inode))
return inode;
inode->i_flags &= ~S_PRIVATE;
+ inode->i_op = &anon_inode_operations;
error = security_inode_init_security_anon(inode, &QSTR(name),
context_inode);
if (error) {
@@ -313,6 +357,7 @@ static int __init anon_inode_init(void)
anon_inode_inode = alloc_anon_inode(anon_inode_mnt->mnt_sb);
if (IS_ERR(anon_inode_inode))
panic("anon_inode_init() inode allocation failed (%ld)\n", PTR_ERR(anon_inode_inode));
+ anon_inode_inode->i_op = &anon_inode_operations;
return 0;
}
diff --git a/fs/autofs/dev-ioctl.c b/fs/autofs/dev-ioctl.c
index c5a6aae12d2c..d8dd150cbd74 100644
--- a/fs/autofs/dev-ioctl.c
+++ b/fs/autofs/dev-ioctl.c
@@ -459,7 +459,8 @@ static int autofs_dev_ioctl_timeout(struct file *fp,
"the parent autofs mount timeout which could "
"prevent shutdown\n");
- dentry = try_lookup_one_len(param->path, base, path_len);
+ dentry = try_lookup_noperm(&QSTR_LEN(param->path, path_len),
+ base);
if (IS_ERR_OR_NULL(dentry))
return dentry ? PTR_ERR(dentry) : -ENOENT;
ino = autofs_dentry_ino(dentry);
diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c
index ff26bb515150..5f195d2280a4 100644
--- a/fs/bcachefs/backpointers.c
+++ b/fs/bcachefs/backpointers.c
@@ -192,7 +192,8 @@ static inline int bch2_backpointers_maybe_flush(struct btree_trans *trans,
static int backpointer_target_not_found(struct btree_trans *trans,
struct bkey_s_c_backpointer bp,
struct bkey_s_c target_k,
- struct bkey_buf *last_flushed)
+ struct bkey_buf *last_flushed,
+ bool commit)
{
struct bch_fs *c = trans->c;
struct printbuf buf = PRINTBUF;
@@ -228,18 +229,77 @@ static int backpointer_target_not_found(struct btree_trans *trans,
}
if (fsck_err(trans, backpointer_to_missing_ptr,
- "%s", buf.buf))
+ "%s", buf.buf)) {
ret = bch2_backpointer_del(trans, bp.k->p);
+ if (ret || !commit)
+ goto out;
+
+ /*
+ * Normally, on transaction commit from inside a transaction,
+ * we'll return -BCH_ERR_transaction_restart_nested, since a
+ * transaction commit invalidates pointers given out by peek().
+ *
+ * However, since we're updating a write buffer btree, if we
+ * return a transaction restart and loop we won't see that the
+ * backpointer has been deleted without an additional write
+ * buffer flush - and those are expensive.
+ *
+ * So we're relying on the caller immediately advancing to the
+ * next backpointer and starting a new transaction immediately
+ * after backpointer_get_key() returns NULL:
+ */
+ ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
+ }
+out:
fsck_err:
printbuf_exit(&buf);
return ret;
}
-struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans,
- struct bkey_s_c_backpointer bp,
- struct btree_iter *iter,
- unsigned iter_flags,
- struct bkey_buf *last_flushed)
+static struct btree *__bch2_backpointer_get_node(struct btree_trans *trans,
+ struct bkey_s_c_backpointer bp,
+ struct btree_iter *iter,
+ struct bkey_buf *last_flushed,
+ bool commit)
+{
+ struct bch_fs *c = trans->c;
+
+ BUG_ON(!bp.v->level);
+
+ bch2_trans_node_iter_init(trans, iter,
+ bp.v->btree_id,
+ bp.v->pos,
+ 0,
+ bp.v->level - 1,
+ 0);
+ struct btree *b = bch2_btree_iter_peek_node(trans, iter);
+ if (IS_ERR_OR_NULL(b))
+ goto err;
+
+ BUG_ON(b->c.level != bp.v->level - 1);
+
+ if (extent_matches_bp(c, bp.v->btree_id, bp.v->level,
+ bkey_i_to_s_c(&b->key), bp))
+ return b;
+
+ if (btree_node_will_make_reachable(b)) {
+ b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node);
+ } else {
+ int ret = backpointer_target_not_found(trans, bp, bkey_i_to_s_c(&b->key),
+ last_flushed, commit);
+ b = ret ? ERR_PTR(ret) : NULL;
+ }
+err:
+ bch2_trans_iter_exit(trans, iter);
+ return b;
+}
+
+static struct bkey_s_c __bch2_backpointer_get_key(struct btree_trans *trans,
+ struct bkey_s_c_backpointer bp,
+ struct btree_iter *iter,
+ unsigned iter_flags,
+ struct bkey_buf *last_flushed,
+ bool commit)
{
struct bch_fs *c = trans->c;
@@ -277,10 +337,10 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans,
bch2_trans_iter_exit(trans, iter);
if (!bp.v->level) {
- int ret = backpointer_target_not_found(trans, bp, k, last_flushed);
+ int ret = backpointer_target_not_found(trans, bp, k, last_flushed, commit);
return ret ? bkey_s_c_err(ret) : bkey_s_c_null;
} else {
- struct btree *b = bch2_backpointer_get_node(trans, bp, iter, last_flushed);
+ struct btree *b = __bch2_backpointer_get_node(trans, bp, iter, last_flushed, commit);
if (b == ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node))
return bkey_s_c_null;
if (IS_ERR_OR_NULL(b))
@@ -295,35 +355,16 @@ struct btree *bch2_backpointer_get_node(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_buf *last_flushed)
{
- struct bch_fs *c = trans->c;
-
- BUG_ON(!bp.v->level);
-
- bch2_trans_node_iter_init(trans, iter,
- bp.v->btree_id,
- bp.v->pos,
- 0,
- bp.v->level - 1,
- 0);
- struct btree *b = bch2_btree_iter_peek_node(trans, iter);
- if (IS_ERR_OR_NULL(b))
- goto err;
-
- BUG_ON(b->c.level != bp.v->level - 1);
-
- if (extent_matches_bp(c, bp.v->btree_id, bp.v->level,
- bkey_i_to_s_c(&b->key), bp))
- return b;
+ return __bch2_backpointer_get_node(trans, bp, iter, last_flushed, true);
+}
- if (btree_node_will_make_reachable(b)) {
- b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node);
- } else {
- int ret = backpointer_target_not_found(trans, bp, bkey_i_to_s_c(&b->key), last_flushed);
- b = ret ? ERR_PTR(ret) : NULL;
- }
-err:
- bch2_trans_iter_exit(trans, iter);
- return b;
+struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans,
+ struct bkey_s_c_backpointer bp,
+ struct btree_iter *iter,
+ unsigned iter_flags,
+ struct bkey_buf *last_flushed)
+{
+ return __bch2_backpointer_get_key(trans, bp, iter, iter_flags, last_flushed, true);
}
static int bch2_check_backpointer_has_valid_bucket(struct btree_trans *trans, struct bkey_s_c k,
@@ -521,7 +562,7 @@ check_existing_bp:
struct bkey_s_c_backpointer other_bp = bkey_s_c_to_backpointer(bp_k);
struct bkey_s_c other_extent =
- bch2_backpointer_get_key(trans, other_bp, &other_extent_iter, 0, NULL);
+ __bch2_backpointer_get_key(trans, other_bp, &other_extent_iter, 0, NULL, false);
ret = bkey_err(other_extent);
if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
ret = 0;
diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
index 9b80201c7982..899891295797 100644
--- a/fs/bcachefs/btree_cache.c
+++ b/fs/bcachefs/btree_cache.c
@@ -852,7 +852,6 @@ out:
b->sib_u64s[1] = 0;
b->whiteout_u64s = 0;
bch2_btree_keys_init(b);
- set_btree_node_accessed(b);
bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
start_time);
@@ -1286,6 +1285,10 @@ lock_node:
six_unlock_read(&b->c.lock);
goto retry;
}
+
+ /* avoid atomic set bit if it's not needed: */
+ if (!btree_node_accessed(b))
+ set_btree_node_accessed(b);
}
/* XXX: waiting on IO with btree locks held: */
@@ -1301,10 +1304,6 @@ lock_node:
prefetch(p + L1_CACHE_BYTES * 2);
}
- /* avoid atomic set bit if it's not needed: */
- if (!btree_node_accessed(b))
- set_btree_node_accessed(b);
-
if (unlikely(btree_node_read_error(b))) {
six_unlock_read(&b->c.lock);
b = ERR_PTR(-BCH_ERR_btree_node_read_err_cached);
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index 59fa527ac685..ac5f2046550d 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -1162,7 +1162,7 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans,
}
if (path->cached) {
- ret = bch2_btree_path_traverse_cached(trans, path, flags);
+ ret = bch2_btree_path_traverse_cached(trans, path_idx, flags);
goto out;
}
@@ -1971,6 +1971,12 @@ struct btree *bch2_btree_iter_next_node(struct btree_trans *trans, struct btree_
return NULL;
}
+ /*
+ * We don't correctly handle nodes with extra intent locks here:
+ * downgrade so we don't violate locking invariants
+ */
+ bch2_btree_path_downgrade(trans, path);
+
if (!bch2_btree_node_relock(trans, path, path->level + 1)) {
__bch2_btree_path_unlock(trans, path);
path->l[path->level].b = ERR_PTR(-BCH_ERR_no_btree_node_relock);
@@ -2743,7 +2749,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *trans, struct btre
ret = trans_maybe_inject_restart(trans, _RET_IP_);
if (unlikely(ret)) {
k = bkey_s_c_err(ret);
- goto out_no_locked;
+ goto out;
}
/* extents can't span inode numbers: */
@@ -2763,13 +2769,15 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *trans, struct btre
ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
if (unlikely(ret)) {
k = bkey_s_c_err(ret);
- goto out_no_locked;
+ goto out;
}
struct btree_path *path = btree_iter_path(trans, iter);
if (unlikely(!btree_path_node(path, path->level)))
return bkey_s_c_null;
+ btree_path_set_should_be_locked(trans, path);
+
if ((iter->flags & BTREE_ITER_cached) ||
!(iter->flags & (BTREE_ITER_is_extents|BTREE_ITER_filter_snapshots))) {
k = bkey_s_c_null;
@@ -2790,12 +2798,12 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *trans, struct btre
if (!bkey_err(k))
iter->k = *k.k;
/* We're not returning a key from iter->path: */
- goto out_no_locked;
+ goto out;
}
- k = bch2_btree_path_peek_slot(trans->paths + iter->path, &iter->k);
+ k = bch2_btree_path_peek_slot(btree_iter_path(trans, iter), &iter->k);
if (unlikely(!k.k))
- goto out_no_locked;
+ goto out;
if (unlikely(k.k->type == KEY_TYPE_whiteout &&
(iter->flags & BTREE_ITER_filter_snapshots) &&
@@ -2833,7 +2841,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *trans, struct btre
}
if (unlikely(bkey_err(k)))
- goto out_no_locked;
+ goto out;
next = k.k ? bkey_start_pos(k.k) : POS_MAX;
@@ -2855,8 +2863,6 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *trans, struct btre
}
}
out:
- btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter));
-out_no_locked:
bch2_btree_iter_verify_entry_exit(iter);
bch2_btree_iter_verify(trans, iter);
ret = bch2_btree_iter_verify_ret(trans, iter, k);
diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c
index 2b186584a291..669825f89cdd 100644
--- a/fs/bcachefs/btree_key_cache.c
+++ b/fs/bcachefs/btree_key_cache.c
@@ -301,9 +301,11 @@ static noinline_for_stack void do_trace_key_cache_fill(struct btree_trans *trans
}
static noinline int btree_key_cache_fill(struct btree_trans *trans,
- struct btree_path *ck_path,
+ btree_path_idx_t ck_path_idx,
unsigned flags)
{
+ struct btree_path *ck_path = trans->paths + ck_path_idx;
+
if (flags & BTREE_ITER_cached_nofill) {
ck_path->l[0].b = NULL;
return 0;
@@ -325,6 +327,7 @@ static noinline int btree_key_cache_fill(struct btree_trans *trans,
goto err;
/* Recheck after btree lookup, before allocating: */
+ ck_path = trans->paths + ck_path_idx;
ret = bch2_btree_key_cache_find(c, ck_path->btree_id, ck_path->pos) ? -EEXIST : 0;
if (unlikely(ret))
goto out;
@@ -344,10 +347,11 @@ err:
}
static inline int btree_path_traverse_cached_fast(struct btree_trans *trans,
- struct btree_path *path)
+ btree_path_idx_t path_idx)
{
struct bch_fs *c = trans->c;
struct bkey_cached *ck;
+ struct btree_path *path = trans->paths + path_idx;
retry:
ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos);
if (!ck)
@@ -373,27 +377,32 @@ retry:
return 0;
}
-int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path *path,
+int bch2_btree_path_traverse_cached(struct btree_trans *trans,
+ btree_path_idx_t path_idx,
unsigned flags)
{
- EBUG_ON(path->level);
-
- path->l[1].b = NULL;
+ EBUG_ON(trans->paths[path_idx].level);
int ret;
do {
- ret = btree_path_traverse_cached_fast(trans, path);
+ ret = btree_path_traverse_cached_fast(trans, path_idx);
if (unlikely(ret == -ENOENT))
- ret = btree_key_cache_fill(trans, path, flags);
+ ret = btree_key_cache_fill(trans, path_idx, flags);
} while (ret == -EEXIST);
+ struct btree_path *path = trans->paths + path_idx;
+
if (unlikely(ret)) {
path->uptodate = BTREE_ITER_NEED_TRAVERSE;
if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
btree_node_unlock(trans, path, 0);
path->l[0].b = ERR_PTR(ret);
}
+ } else {
+ BUG_ON(path->uptodate);
+ BUG_ON(!path->nodes_locked);
}
+
return ret;
}
diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h
index 51d6289b8dee..82d8c72512a9 100644
--- a/fs/bcachefs/btree_key_cache.h
+++ b/fs/bcachefs/btree_key_cache.h
@@ -40,8 +40,7 @@ int bch2_btree_key_cache_journal_flush(struct journal *,
struct bkey_cached *
bch2_btree_key_cache_find(struct bch_fs *, enum btree_id, struct bpos);
-int bch2_btree_path_traverse_cached(struct btree_trans *, struct btree_path *,
- unsigned);
+int bch2_btree_path_traverse_cached(struct btree_trans *, btree_path_idx_t, unsigned);
bool bch2_btree_insert_key_cached(struct btree_trans *, unsigned,
struct btree_insert_entry *);
diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c
index 8a680e52c1ed..a51195088227 100644
--- a/fs/bcachefs/dirent.c
+++ b/fs/bcachefs/dirent.c
@@ -288,6 +288,7 @@ static void dirent_init_casefolded_name(struct bkey_i_dirent *dirent,
}
static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
+ const struct bch_hash_info *hash_info,
subvol_inum dir,
u8 type,
const struct qstr *name,
@@ -295,10 +296,19 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
u64 dst)
{
struct bkey_i_dirent *dirent;
+ struct qstr _cf_name;
if (name->len > BCH_NAME_MAX)
return ERR_PTR(-ENAMETOOLONG);
+ if (hash_info->cf_encoding && !cf_name) {
+ int ret = bch2_casefold(trans, hash_info, name, &_cf_name);
+ if (ret)
+ return ERR_PTR(ret);
+
+ cf_name = &_cf_name;
+ }
+
dirent = dirent_alloc_key(trans, dir, type, name->len, cf_name ? cf_name->len : 0, dst);
if (IS_ERR(dirent))
return dirent;
@@ -324,7 +334,7 @@ int bch2_dirent_create_snapshot(struct btree_trans *trans,
struct bkey_i_dirent *dirent;
int ret;
- dirent = dirent_create_key(trans, dir_inum, type, name, NULL, dst_inum);
+ dirent = dirent_create_key(trans, hash_info, dir_inum, type, name, NULL, dst_inum);
ret = PTR_ERR_OR_ZERO(dirent);
if (ret)
return ret;
@@ -333,8 +343,7 @@ int bch2_dirent_create_snapshot(struct btree_trans *trans,
dirent->k.p.snapshot = snapshot;
ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info,
- dir_inum, snapshot, &dirent->k_i,
- flags|BTREE_UPDATE_internal_snapshot_node);
+ dir_inum, snapshot, &dirent->k_i, flags);
*dir_offset = dirent->k.p.offset;
return ret;
@@ -344,28 +353,16 @@ int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir,
const struct bch_hash_info *hash_info,
u8 type, const struct qstr *name, u64 dst_inum,
u64 *dir_offset,
- u64 *i_size,
enum btree_iter_update_trigger_flags flags)
{
struct bkey_i_dirent *dirent;
int ret;
- if (hash_info->cf_encoding) {
- struct qstr cf_name;
- ret = bch2_casefold(trans, hash_info, name, &cf_name);
- if (ret)
- return ret;
- dirent = dirent_create_key(trans, dir, type, name, &cf_name, dst_inum);
- } else {
- dirent = dirent_create_key(trans, dir, type, name, NULL, dst_inum);
- }
-
+ dirent = dirent_create_key(trans, hash_info, dir, type, name, NULL, dst_inum);
ret = PTR_ERR_OR_ZERO(dirent);
if (ret)
return ret;
- *i_size += bkey_bytes(&dirent->k);
-
ret = bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info,
dir, &dirent->k_i, flags);
*dir_offset = dirent->k.p.offset;
@@ -466,7 +463,7 @@ int bch2_dirent_rename(struct btree_trans *trans,
*src_offset = dst_iter.pos.offset;
/* Create new dst key: */
- new_dst = dirent_create_key(trans, dst_dir, 0, dst_name,
+ new_dst = dirent_create_key(trans, dst_hash, dst_dir, 0, dst_name,
dst_hash->cf_encoding ? &dst_name_lookup : NULL, 0);
ret = PTR_ERR_OR_ZERO(new_dst);
if (ret)
@@ -477,7 +474,7 @@ int bch2_dirent_rename(struct btree_trans *trans,
/* Create new src key: */
if (mode == BCH_RENAME_EXCHANGE) {
- new_src = dirent_create_key(trans, src_dir, 0, src_name,
+ new_src = dirent_create_key(trans, src_hash, src_dir, 0, src_name,
src_hash->cf_encoding ? &src_name_lookup : NULL, 0);
ret = PTR_ERR_OR_ZERO(new_src);
if (ret)
diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h
index 9838a7ba7ed1..d3e7ae669575 100644
--- a/fs/bcachefs/dirent.h
+++ b/fs/bcachefs/dirent.h
@@ -65,7 +65,7 @@ int bch2_dirent_create_snapshot(struct btree_trans *, u32, u64, u32,
enum btree_iter_update_trigger_flags);
int bch2_dirent_create(struct btree_trans *, subvol_inum,
const struct bch_hash_info *, u8,
- const struct qstr *, u64, u64 *, u64 *,
+ const struct qstr *, u64, u64 *,
enum btree_iter_update_trigger_flags);
static inline unsigned vfs_d_type(unsigned type)
diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c
index b007319b72e9..1f0422bfae35 100644
--- a/fs/bcachefs/disk_accounting.c
+++ b/fs/bcachefs/disk_accounting.c
@@ -376,6 +376,19 @@ int bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a,
return ret;
}
+int bch2_accounting_mem_insert_locked(struct bch_fs *c, struct bkey_s_c_accounting a,
+ enum bch_accounting_mode mode)
+{
+ struct bch_replicas_padded r;
+
+ if (mode != BCH_ACCOUNTING_read &&
+ accounting_to_replicas(&r.e, a.k->p) &&
+ !bch2_replicas_marked_locked(c, &r.e))
+ return -BCH_ERR_btree_insert_need_mark_replicas;
+
+ return __bch2_accounting_mem_insert(c, a);
+}
+
static bool accounting_mem_entry_is_zero(struct accounting_mem_entry *e)
{
for (unsigned i = 0; i < e->nr_counters; i++)
@@ -583,7 +596,7 @@ int bch2_gc_accounting_done(struct bch_fs *c)
accounting_key_init(&k_i.k, &acc_k, src_v, nr);
bch2_accounting_mem_mod_locked(trans,
bkey_i_to_s_c_accounting(&k_i.k),
- BCH_ACCOUNTING_normal);
+ BCH_ACCOUNTING_normal, true);
preempt_disable();
struct bch_fs_usage_base *dst = this_cpu_ptr(c->usage);
@@ -612,7 +625,7 @@ static int accounting_read_key(struct btree_trans *trans, struct bkey_s_c k)
percpu_down_read(&c->mark_lock);
int ret = bch2_accounting_mem_mod_locked(trans, bkey_s_c_to_accounting(k),
- BCH_ACCOUNTING_read);
+ BCH_ACCOUNTING_read, false);
percpu_up_read(&c->mark_lock);
return ret;
}
diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h
index abb1f6206fe9..d557b99b3c0a 100644
--- a/fs/bcachefs/disk_accounting.h
+++ b/fs/bcachefs/disk_accounting.h
@@ -136,6 +136,7 @@ enum bch_accounting_mode {
};
int bch2_accounting_mem_insert(struct bch_fs *, struct bkey_s_c_accounting, enum bch_accounting_mode);
+int bch2_accounting_mem_insert_locked(struct bch_fs *, struct bkey_s_c_accounting, enum bch_accounting_mode);
void bch2_accounting_mem_gc(struct bch_fs *);
static inline bool bch2_accounting_is_mem(struct disk_accounting_pos acc)
@@ -150,7 +151,8 @@ static inline bool bch2_accounting_is_mem(struct disk_accounting_pos acc)
*/
static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans,
struct bkey_s_c_accounting a,
- enum bch_accounting_mode mode)
+ enum bch_accounting_mode mode,
+ bool write_locked)
{
struct bch_fs *c = trans->c;
struct bch_accounting_mem *acc = &c->accounting;
@@ -189,7 +191,11 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans,
while ((idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
accounting_pos_cmp, &a.k->p)) >= acc->k.nr) {
- int ret = bch2_accounting_mem_insert(c, a, mode);
+ int ret = 0;
+ if (unlikely(write_locked))
+ ret = bch2_accounting_mem_insert_locked(c, a, mode);
+ else
+ ret = bch2_accounting_mem_insert(c, a, mode);
if (ret)
return ret;
}
@@ -206,7 +212,7 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans,
static inline int bch2_accounting_mem_add(struct btree_trans *trans, struct bkey_s_c_accounting a, bool gc)
{
percpu_down_read(&trans->c->mark_lock);
- int ret = bch2_accounting_mem_mod_locked(trans, a, gc ? BCH_ACCOUNTING_gc : BCH_ACCOUNTING_normal);
+ int ret = bch2_accounting_mem_mod_locked(trans, a, gc ? BCH_ACCOUNTING_gc : BCH_ACCOUNTING_normal, false);
percpu_up_read(&trans->c->mark_lock);
return ret;
}
@@ -259,7 +265,7 @@ static inline int bch2_accounting_trans_commit_hook(struct btree_trans *trans,
EBUG_ON(bversion_zero(a->k.bversion));
return likely(!(commit_flags & BCH_TRANS_COMMIT_skip_accounting_apply))
- ? bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), BCH_ACCOUNTING_normal)
+ ? bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), BCH_ACCOUNTING_normal, false)
: 0;
}
@@ -271,7 +277,7 @@ static inline void bch2_accounting_trans_commit_revert(struct btree_trans *trans
struct bkey_s_accounting a = accounting_i_to_s(a_i);
bch2_accounting_neg(a);
- bch2_accounting_mem_mod_locked(trans, a.c, BCH_ACCOUNTING_normal);
+ bch2_accounting_mem_mod_locked(trans, a.c, BCH_ACCOUNTING_normal, false);
bch2_accounting_neg(a);
}
}
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
index fff58b78327c..c6cb26981923 100644
--- a/fs/bcachefs/ec.c
+++ b/fs/bcachefs/ec.c
@@ -507,20 +507,14 @@ static const struct bch_extent_ptr *bkey_matches_stripe(struct bch_stripe *s,
static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx)
{
- switch (k.k->type) {
- case KEY_TYPE_extent: {
- struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
- const union bch_extent_entry *entry;
-
- extent_for_each_entry(e, entry)
- if (extent_entry_type(entry) ==
- BCH_EXTENT_ENTRY_stripe_ptr &&
- entry->stripe_ptr.idx == idx)
- return true;
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
- break;
- }
- }
+ bkey_extent_entry_for_each(ptrs, entry)
+ if (extent_entry_type(entry) ==
+ BCH_EXTENT_ENTRY_stripe_ptr &&
+ entry->stripe_ptr.idx == idx)
+ return true;
return false;
}
diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h
index e78a39e7e18f..9fe153183b36 100644
--- a/fs/bcachefs/extents.h
+++ b/fs/bcachefs/extents.h
@@ -380,13 +380,6 @@ out: \
/* Iterate over pointers in KEY_TYPE_extent: */
-#define extent_for_each_entry_from(_e, _entry, _start) \
- __bkey_extent_entry_for_each_from(_start, \
- extent_entry_last(_e), _entry)
-
-#define extent_for_each_entry(_e, _entry) \
- extent_for_each_entry_from(_e, _entry, (_e).v->start)
-
#define extent_ptr_next(_e, _ptr) \
__bkey_ptr_next(_ptr, extent_entry_last(_e))
diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c
index e072900e6a5b..fbae9c1de746 100644
--- a/fs/bcachefs/fs-io-pagecache.c
+++ b/fs/bcachefs/fs-io-pagecache.c
@@ -605,10 +605,14 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf)
struct address_space *mapping = file->f_mapping;
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch2_folio_reservation res;
- unsigned len;
- loff_t isize;
vm_fault_t ret;
+ loff_t file_offset = round_down(vmf->pgoff << PAGE_SHIFT, block_bytes(c));
+ unsigned offset = file_offset - folio_pos(folio);
+ unsigned len = max(PAGE_SIZE, block_bytes(c));
+
+ BUG_ON(offset + len > folio_size(folio));
+
bch2_folio_reservation_init(c, inode, &res);
sb_start_pagefault(inode->v.i_sb);
@@ -623,24 +627,24 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf)
bch2_pagecache_add_get(inode);
folio_lock(folio);
- isize = i_size_read(&inode->v);
+ u64 isize = i_size_read(&inode->v);
- if (folio->mapping != mapping || folio_pos(folio) >= isize) {
+ if (folio->mapping != mapping || file_offset >= isize) {
folio_unlock(folio);
ret = VM_FAULT_NOPAGE;
goto out;
}
- len = min_t(loff_t, folio_size(folio), isize - folio_pos(folio));
+ len = min_t(unsigned, len, isize - file_offset);
if (bch2_folio_set(c, inode_inum(inode), &folio, 1) ?:
- bch2_folio_reservation_get(c, inode, folio, &res, 0, len)) {
+ bch2_folio_reservation_get(c, inode, folio, &res, offset, len)) {
folio_unlock(folio);
ret = VM_FAULT_SIGBUS;
goto out;
}
- bch2_set_folio_dirty(c, inode, folio, &res, 0, len);
+ bch2_set_folio_dirty(c, inode, folio, &res, offset, len);
bch2_folio_reservation_put(c, inode, &res);
folio_wait_stable(folio);
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index b6801861c66f..47f1a64c5c8d 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -1429,7 +1429,9 @@ static int bch2_next_fiemap_extent(struct btree_trans *trans,
if (ret)
goto err;
- ret = bch2_next_fiemap_pagecache_extent(trans, inode, start, end, cur);
+ u64 pagecache_end = k.k ? max(start, bkey_start_offset(k.k)) : end;
+
+ ret = bch2_next_fiemap_pagecache_extent(trans, inode, start, pagecache_end, cur);
if (ret)
goto err;
@@ -1662,33 +1664,9 @@ static int fssetxattr_inode_update_fn(struct btree_trans *trans,
return -EINVAL;
if (s->casefold != bch2_inode_casefold(c, bi)) {
-#ifdef CONFIG_UNICODE
- int ret = 0;
- /* Not supported on individual files. */
- if (!S_ISDIR(bi->bi_mode))
- return -EOPNOTSUPP;
-
- /*
- * Make sure the dir is empty, as otherwise we'd need to
- * rehash everything and update the dirent keys.
- */
- ret = bch2_empty_dir_trans(trans, inode_inum(inode));
- if (ret < 0)
- return ret;
-
- ret = bch2_request_incompat_feature(c, bcachefs_metadata_version_casefolding);
+ int ret = bch2_inode_set_casefold(trans, inode_inum(inode), bi, s->casefold);
if (ret)
return ret;
-
- bch2_check_set_feature(c, BCH_FEATURE_casefolding);
-
- bi->bi_casefold = s->casefold + 1;
- bi->bi_fields_set |= BIT(Inode_opt_casefold);
-
-#else
- printk(KERN_ERR "Cannot use casefolding on a kernel without CONFIG_UNICODE\n");
- return -EOPNOTSUPP;
-#endif
}
if (s->set_project) {
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index 7b25cedd3e40..aaf187085276 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -306,6 +306,7 @@ create_lostfound:
&lostfound_str,
lostfound->bi_inum,
&lostfound->bi_dir_offset,
+ BTREE_UPDATE_internal_snapshot_node|
STR_HASH_must_create) ?:
bch2_inode_write_flags(trans, &lostfound_iter, lostfound,
BTREE_UPDATE_internal_snapshot_node);
@@ -431,6 +432,7 @@ static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked *
&name,
inode->bi_subvol ?: inode->bi_inum,
&inode->bi_dir_offset,
+ BTREE_UPDATE_internal_snapshot_node|
STR_HASH_must_create);
if (ret) {
bch_err_msg(c, ret, "error creating dirent");
@@ -2188,6 +2190,41 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
+ /* check casefold */
+ if (fsck_err_on(d.v->d_casefold != !!hash_info->cf_encoding,
+ trans, dirent_casefold_mismatch,
+ "dirent casefold does not match dir casefold\n%s",
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k),
+ buf.buf))) {
+ struct qstr name = bch2_dirent_get_name(d);
+ u32 subvol = d.v->d_type == DT_SUBVOL
+ ? d.v->d_parent_subvol
+ : 0;
+ u64 target = d.v->d_type == DT_SUBVOL
+ ? d.v->d_child_subvol
+ : d.v->d_inum;
+ u64 dir_offset;
+
+ ret = bch2_hash_delete_at(trans,
+ bch2_dirent_hash_desc, hash_info, iter,
+ BTREE_UPDATE_internal_snapshot_node) ?:
+ bch2_dirent_create_snapshot(trans, subvol,
+ d.k->p.inode, d.k->p.snapshot,
+ hash_info,
+ d.v->d_type,
+ &name,
+ target,
+ &dir_offset,
+ BTREE_ITER_with_updates|
+ BTREE_UPDATE_internal_snapshot_node|
+ STR_HASH_must_create) ?:
+ bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
+
+ /* might need another check_dirents pass */
+ goto out;
+ }
+
if (d.v->d_type == DT_SUBVOL) {
ret = check_dirent_to_subvol(trans, iter, d);
if (ret)
@@ -2446,7 +2483,7 @@ static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter,
u32 parent = le32_to_cpu(s.v->fs_path_parent);
if (darray_u32_has(&subvol_path, parent)) {
- if (fsck_err(c, subvol_loop, "subvolume loop"))
+ if (fsck_err(trans, subvol_loop, "subvolume loop"))
ret = reattach_subvol(trans, s);
break;
}
diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c
index b51d98cf8a80..490b85841de9 100644
--- a/fs/bcachefs/inode.c
+++ b/fs/bcachefs/inode.c
@@ -14,6 +14,7 @@
#include "extent_update.h"
#include "fs.h"
#include "inode.h"
+#include "namei.h"
#include "opts.h"
#include "str_hash.h"
#include "snapshot.h"
@@ -1204,6 +1205,41 @@ int bch2_inum_opts_get(struct btree_trans *trans, subvol_inum inum, struct bch_i
return 0;
}
+int bch2_inode_set_casefold(struct btree_trans *trans, subvol_inum inum,
+ struct bch_inode_unpacked *bi, unsigned v)
+{
+ struct bch_fs *c = trans->c;
+
+#ifdef CONFIG_UNICODE
+ int ret = 0;
+ /* Not supported on individual files. */
+ if (!S_ISDIR(bi->bi_mode))
+ return -EOPNOTSUPP;
+
+ /*
+ * Make sure the dir is empty, as otherwise we'd need to
+ * rehash everything and update the dirent keys.
+ */
+ ret = bch2_empty_dir_trans(trans, inum);
+ if (ret < 0)
+ return ret;
+
+ ret = bch2_request_incompat_feature(c, bcachefs_metadata_version_casefolding);
+ if (ret)
+ return ret;
+
+ bch2_check_set_feature(c, BCH_FEATURE_casefolding);
+
+ bi->bi_casefold = v + 1;
+ bi->bi_fields_set |= BIT(Inode_opt_casefold);
+
+ return 0;
+#else
+ bch_err(c, "Cannot use casefolding on a kernel without CONFIG_UNICODE");
+ return -EOPNOTSUPP;
+#endif
+}
+
static noinline int __bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot)
{
struct bch_fs *c = trans->c;
diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h
index c74af15b14f2..5cfba9e98966 100644
--- a/fs/bcachefs/inode.h
+++ b/fs/bcachefs/inode.h
@@ -292,7 +292,9 @@ static inline bool bch2_inode_should_have_single_bp(struct bch_inode_unpacked *i
struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *);
void bch2_inode_opts_get(struct bch_io_opts *, struct bch_fs *,
struct bch_inode_unpacked *);
-int bch2_inum_opts_get(struct btree_trans*, subvol_inum, struct bch_io_opts *);
+int bch2_inum_opts_get(struct btree_trans *, subvol_inum, struct bch_io_opts *);
+int bch2_inode_set_casefold(struct btree_trans *, subvol_inum,
+ struct bch_inode_unpacked *, unsigned);
#include "rebalance.h"
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
index 976464d8a695..cc00b0fc40d8 100644
--- a/fs/bcachefs/journal_reclaim.c
+++ b/fs/bcachefs/journal_reclaim.c
@@ -17,6 +17,8 @@
#include <linux/kthread.h>
#include <linux/sched/mm.h>
+static bool __should_discard_bucket(struct journal *, struct journal_device *);
+
/* Free space calculations: */
static unsigned journal_space_from(struct journal_device *ja,
@@ -203,8 +205,7 @@ void bch2_journal_space_available(struct journal *j)
ja->bucket_seq[ja->dirty_idx_ondisk] < j->last_seq_ondisk)
ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr;
- if (ja->discard_idx != ja->dirty_idx_ondisk)
- can_discard = true;
+ can_discard |= __should_discard_bucket(j, ja);
max_entry_size = min_t(unsigned, max_entry_size, ca->mi.bucket_size);
nr_online++;
@@ -264,13 +265,19 @@ out:
/* Discards - last part of journal reclaim: */
-static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
+static bool __should_discard_bucket(struct journal *j, struct journal_device *ja)
{
- spin_lock(&j->lock);
unsigned min_free = max(4, ja->nr / 8);
- bool ret = bch2_journal_dev_buckets_available(j, ja, journal_space_discarded) < min_free &&
+ return bch2_journal_dev_buckets_available(j, ja, journal_space_discarded) <
+ min_free &&
ja->discard_idx != ja->dirty_idx_ondisk;
+}
+
+static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
+{
+ spin_lock(&j->lock);
+ bool ret = __should_discard_bucket(j, ja);
spin_unlock(&j->lock);
return ret;
diff --git a/fs/bcachefs/namei.c b/fs/bcachefs/namei.c
index 52c58c6d53d2..9136a9097789 100644
--- a/fs/bcachefs/namei.c
+++ b/fs/bcachefs/namei.c
@@ -158,7 +158,6 @@ int bch2_create_trans(struct btree_trans *trans,
name,
dir_target,
&dir_offset,
- &dir_u->bi_size,
STR_HASH_must_create|BTREE_ITER_with_updates) ?:
bch2_inode_write(trans, &dir_iter, dir_u);
if (ret)
@@ -225,7 +224,6 @@ int bch2_link_trans(struct btree_trans *trans,
mode_to_type(inode_u->bi_mode),
name, inum.inum,
&dir_offset,
- &dir_u->bi_size,
STR_HASH_must_create);
if (ret)
goto err;
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
index 4ccdfc1f34aa..623273556aa9 100644
--- a/fs/bcachefs/rebalance.c
+++ b/fs/bcachefs/rebalance.c
@@ -309,7 +309,7 @@ static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k)
{
- if (!bch2_bkey_rebalance_opts(k))
+ if (k.k->type == KEY_TYPE_reflink_v || !bch2_bkey_rebalance_opts(k))
return 0;
struct bkey_i *n = bch2_bkey_make_mut(trans, iter, &k, 0);
diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h
index 3b69a924086f..4036a20c6adc 100644
--- a/fs/bcachefs/sb-errors_format.h
+++ b/fs/bcachefs/sb-errors_format.h
@@ -209,6 +209,7 @@ enum bch_fsck_flags {
x(subvol_to_missing_root, 188, 0) \
x(subvol_root_wrong_bi_subvol, 189, FSCK_AUTOFIX) \
x(bkey_in_missing_snapshot, 190, 0) \
+ x(bkey_in_deleted_snapshot, 315, 0) \
x(inode_pos_inode_nonzero, 191, 0) \
x(inode_pos_blockdev_range, 192, 0) \
x(inode_alloc_cursor_inode_bad, 301, 0) \
@@ -216,6 +217,7 @@ enum bch_fsck_flags {
x(inode_str_hash_invalid, 194, 0) \
x(inode_v3_fields_start_bad, 195, 0) \
x(inode_snapshot_mismatch, 196, 0) \
+ x(snapshot_key_missing_inode_snapshot, 314, 0) \
x(inode_unlinked_but_clean, 197, 0) \
x(inode_unlinked_but_nlink_nonzero, 198, 0) \
x(inode_unlinked_and_not_open, 281, 0) \
@@ -237,6 +239,8 @@ enum bch_fsck_flags {
x(inode_unreachable, 210, FSCK_AUTOFIX) \
x(inode_journal_seq_in_future, 299, FSCK_AUTOFIX) \
x(inode_i_sectors_underflow, 312, FSCK_AUTOFIX) \
+ x(inode_has_case_insensitive_not_set, 316, FSCK_AUTOFIX) \
+ x(inode_parent_has_case_insensitive_not_set, 317, FSCK_AUTOFIX) \
x(vfs_inode_i_blocks_underflow, 311, FSCK_AUTOFIX) \
x(vfs_inode_i_blocks_not_zero_at_truncate, 313, FSCK_AUTOFIX) \
x(deleted_inode_but_clean, 211, FSCK_AUTOFIX) \
@@ -262,6 +266,7 @@ enum bch_fsck_flags {
x(dirent_to_overwritten_inode, 302, 0) \
x(dirent_to_missing_subvol, 230, 0) \
x(dirent_to_itself, 231, 0) \
+ x(dirent_casefold_mismatch, 318, FSCK_AUTOFIX) \
x(quota_type_invalid, 232, 0) \
x(xattr_val_size_too_small, 233, 0) \
x(xattr_val_size_too_big, 234, 0) \
@@ -301,6 +306,7 @@ enum bch_fsck_flags {
x(btree_ptr_v2_written_0, 268, 0) \
x(subvol_snapshot_bad, 269, 0) \
x(subvol_inode_bad, 270, 0) \
+ x(subvol_missing, 308, FSCK_AUTOFIX) \
x(alloc_key_stripe_sectors_wrong, 271, FSCK_AUTOFIX) \
x(accounting_mismatch, 272, FSCK_AUTOFIX) \
x(accounting_replicas_not_marked, 273, 0) \
@@ -322,7 +328,7 @@ enum bch_fsck_flags {
x(dirent_stray_data_after_cf_name, 305, 0) \
x(rebalance_work_incorrectly_set, 309, FSCK_AUTOFIX) \
x(rebalance_work_incorrectly_unset, 310, FSCK_AUTOFIX) \
- x(MAX, 314, 0)
+ x(MAX, 319, 0)
enum bch_sb_error_id {
#define x(t, n, ...) BCH_FSCK_ERR_##t = n,
diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c
index 651da52b2cbc..e6be32003f3b 100644
--- a/fs/bcachefs/xattr.c
+++ b/fs/bcachefs/xattr.c
@@ -473,6 +473,12 @@ static int inode_opt_set_fn(struct btree_trans *trans,
{
struct inode_opt_set *s = p;
+ if (s->id == Inode_opt_casefold) {
+ int ret = bch2_inode_set_casefold(trans, inode_inum(inode), bi, s->v);
+ if (ret)
+ return ret;
+ }
+
if (s->defined)
bi->bi_fields_set |= 1U << s->id;
else
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index db81570c9637..1d41ce477df5 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -17,6 +17,7 @@
#include <linux/writeback.h>
#include <linux/uio.h>
#include <linux/uaccess.h>
+#include <linux/fs_context.h>
#include "bfs.h"
MODULE_AUTHOR("Tigran Aivazian <aivazian.tigran@gmail.com>");
@@ -305,7 +306,7 @@ void bfs_dump_imap(const char *prefix, struct super_block *s)
#endif
}
-static int bfs_fill_super(struct super_block *s, void *data, int silent)
+static int bfs_fill_super(struct super_block *s, struct fs_context *fc)
{
struct buffer_head *bh, *sbh;
struct bfs_super_block *bfs_sb;
@@ -314,6 +315,7 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
struct bfs_sb_info *info;
int ret = -EINVAL;
unsigned long i_sblock, i_eblock, i_eoff, s_size;
+ int silent = fc->sb_flags & SB_SILENT;
info = kzalloc(sizeof(*info), GFP_KERNEL);
if (!info)
@@ -446,18 +448,28 @@ out:
return ret;
}
-static struct dentry *bfs_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
+static int bfs_get_tree(struct fs_context *fc)
{
- return mount_bdev(fs_type, flags, dev_name, data, bfs_fill_super);
+ return get_tree_bdev(fc, bfs_fill_super);
+}
+
+static const struct fs_context_operations bfs_context_ops = {
+ .get_tree = bfs_get_tree,
+};
+
+static int bfs_init_fs_context(struct fs_context *fc)
+{
+ fc->ops = &bfs_context_ops;
+
+ return 0;
}
static struct file_system_type bfs_fs_type = {
- .owner = THIS_MODULE,
- .name = "bfs",
- .mount = bfs_mount,
- .kill_sb = kill_block_super,
- .fs_flags = FS_REQUIRES_DEV,
+ .owner = THIS_MODULE,
+ .name = "bfs",
+ .init_fs_context = bfs_init_fs_context,
+ .kill_sb = kill_block_super,
+ .fs_flags = FS_REQUIRES_DEV,
};
MODULE_ALIAS_FS("bfs");
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 4c1ea6b52a53..a43363d593e5 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -68,12 +68,6 @@
static int load_elf_binary(struct linux_binprm *bprm);
-#ifdef CONFIG_USELIB
-static int load_elf_library(struct file *);
-#else
-#define load_elf_library NULL
-#endif
-
/*
* If we don't support core dumping, then supply a NULL so we
* don't even try.
@@ -101,7 +95,6 @@ static int elf_core_dump(struct coredump_params *cprm);
static struct linux_binfmt elf_format = {
.module = THIS_MODULE,
.load_binary = load_elf_binary,
- .load_shlib = load_elf_library,
#ifdef CONFIG_COREDUMP
.core_dump = elf_core_dump,
.min_coredump = ELF_EXEC_PAGESIZE,
@@ -1384,75 +1377,6 @@ out_free_ph:
goto out;
}
-#ifdef CONFIG_USELIB
-/* This is really simpleminded and specialized - we are loading an
- a.out library that is given an ELF header. */
-static int load_elf_library(struct file *file)
-{
- struct elf_phdr *elf_phdata;
- struct elf_phdr *eppnt;
- int retval, error, i, j;
- struct elfhdr elf_ex;
-
- error = -ENOEXEC;
- retval = elf_read(file, &elf_ex, sizeof(elf_ex), 0);
- if (retval < 0)
- goto out;
-
- if (memcmp(elf_ex.e_ident, ELFMAG, SELFMAG) != 0)
- goto out;
-
- /* First of all, some simple consistency checks */
- if (elf_ex.e_type != ET_EXEC || elf_ex.e_phnum > 2 ||
- !elf_check_arch(&elf_ex) || !file->f_op->mmap)
- goto out;
- if (elf_check_fdpic(&elf_ex))
- goto out;
-
- /* Now read in all of the header information */
-
- j = sizeof(struct elf_phdr) * elf_ex.e_phnum;
- /* j < ELF_MIN_ALIGN because elf_ex.e_phnum <= 2 */
-
- error = -ENOMEM;
- elf_phdata = kmalloc(j, GFP_KERNEL);
- if (!elf_phdata)
- goto out;
-
- eppnt = elf_phdata;
- error = -ENOEXEC;
- retval = elf_read(file, eppnt, j, elf_ex.e_phoff);
- if (retval < 0)
- goto out_free_ph;
-
- for (j = 0, i = 0; i<elf_ex.e_phnum; i++)
- if ((eppnt + i)->p_type == PT_LOAD)
- j++;
- if (j != 1)
- goto out_free_ph;
-
- while (eppnt->p_type != PT_LOAD)
- eppnt++;
-
- /* Now use mmap to map the library into memory. */
- error = elf_load(file, ELF_PAGESTART(eppnt->p_vaddr),
- eppnt,
- PROT_READ | PROT_WRITE | PROT_EXEC,
- MAP_FIXED_NOREPLACE | MAP_PRIVATE,
- 0);
-
- if (error != ELF_PAGESTART(eppnt->p_vaddr))
- goto out_free_ph;
-
- error = 0;
-
-out_free_ph:
- kfree(elf_phdata);
-out:
- return error;
-}
-#endif /* #ifdef CONFIG_USELIB */
-
#ifdef CONFIG_ELF_CORE
/*
* ELF core dumper
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 5a7ebd160724..432fbf4fc334 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -842,7 +842,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
}
inode_lock(d_inode(root));
- dentry = lookup_one_len(e->name, root, strlen(e->name));
+ dentry = lookup_noperm(&QSTR(e->name), root);
err = PTR_ERR(dentry);
if (IS_ERR(dentry))
goto out;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index a498fe524c90..913acef3f0a9 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -909,7 +909,7 @@ static noinline int btrfs_mksubvol(const struct path *parent,
if (error == -EINTR)
return error;
- dentry = lookup_one(idmap, name, parent->dentry, namelen);
+ dentry = lookup_one(idmap, &QSTR_LEN(name, namelen), parent->dentry);
error = PTR_ERR(dentry);
if (IS_ERR(dentry))
goto out_unlock;
@@ -2288,7 +2288,6 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
struct btrfs_ioctl_vol_args_v2 *vol_args2 = NULL;
struct mnt_idmap *idmap = file_mnt_idmap(file);
char *subvol_name, *subvol_name_ptr = NULL;
- int subvol_namelen;
int ret = 0;
bool destroy_parent = false;
@@ -2411,10 +2410,8 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
goto out;
}
- subvol_namelen = strlen(subvol_name);
-
if (strchr(subvol_name, '/') ||
- strncmp(subvol_name, "..", subvol_namelen) == 0) {
+ strcmp(subvol_name, "..") == 0) {
ret = -EINVAL;
goto free_subvol_name;
}
@@ -2427,7 +2424,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
ret = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT);
if (ret == -EINTR)
goto free_subvol_name;
- dentry = lookup_one(idmap, subvol_name, parent, subvol_namelen);
+ dentry = lookup_one(idmap, &QSTR(subvol_name), parent);
if (IS_ERR(dentry)) {
ret = PTR_ERR(dentry);
goto out_unlock_dir;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index ed120621021b..ce36fafc771e 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -2882,17 +2882,11 @@ static int scrub_one_super(struct scrub_ctx *sctx, struct btrfs_device *dev,
struct page *page, u64 physical, u64 generation)
{
struct btrfs_fs_info *fs_info = sctx->fs_info;
- struct bio_vec bvec;
- struct bio bio;
struct btrfs_super_block *sb = page_address(page);
int ret;
- bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_READ);
- bio.bi_iter.bi_sector = physical >> SECTOR_SHIFT;
- __bio_add_page(&bio, page, BTRFS_SUPER_INFO_SIZE, 0);
- ret = submit_bio_wait(&bio);
- bio_uninit(&bio);
-
+ ret = bdev_rw_virt(dev->bdev, physical >> SECTOR_SHIFT, sb,
+ BTRFS_SUPER_INFO_SIZE, REQ_OP_READ);
if (ret < 0)
return ret;
ret = btrfs_check_super_csum(fs_info, sb);
diff --git a/fs/buffer.c b/fs/buffer.c
index 7ba1807145aa..8cf4a1dc481e 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -297,7 +297,6 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
still_busy:
spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
- return;
}
struct postprocess_bh_ctx {
@@ -422,7 +421,6 @@ static void end_buffer_async_write(struct buffer_head *bh, int uptodate)
still_busy:
spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
- return;
}
/*
@@ -1122,6 +1120,8 @@ static struct buffer_head *
__getblk_slow(struct block_device *bdev, sector_t block,
unsigned size, gfp_t gfp)
{
+ bool blocking = gfpflags_allow_blocking(gfp);
+
/* Size must be multiple of hard sectorsize */
if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
(size < 512 || size > PAGE_SIZE))) {
@@ -1137,12 +1137,15 @@ __getblk_slow(struct block_device *bdev, sector_t block,
for (;;) {
struct buffer_head *bh;
- bh = __find_get_block(bdev, block, size);
- if (bh)
- return bh;
-
if (!grow_buffers(bdev, block, size, gfp))
return NULL;
+
+ if (blocking)
+ bh = __find_get_block_nonatomic(bdev, block, size);
+ else
+ bh = __find_get_block(bdev, block, size);
+ if (bh)
+ return bh;
}
}
@@ -1611,8 +1614,8 @@ static void discard_buffer(struct buffer_head * bh)
bh->b_bdev = NULL;
b_state = READ_ONCE(bh->b_state);
do {
- } while (!try_cmpxchg(&bh->b_state, &b_state,
- b_state & ~BUFFER_FLAGS_DISCARD));
+ } while (!try_cmpxchg_relaxed(&bh->b_state, &b_state,
+ b_state & ~BUFFER_FLAGS_DISCARD));
unlock_buffer(bh);
}
@@ -1677,7 +1680,6 @@ void block_invalidate_folio(struct folio *folio, size_t offset, size_t length)
filemap_release_folio(folio, 0);
out:
folio_clear_mappedtodisk(folio);
- return;
}
EXPORT_SYMBOL(block_invalidate_folio);
@@ -2728,7 +2730,7 @@ unlock:
EXPORT_SYMBOL(block_truncate_page);
/*
- * The generic ->writepage function for buffer-backed address_spaces
+ * The generic write folio function for buffer-backed address_spaces
*/
int block_write_full_folio(struct folio *folio, struct writeback_control *wbc,
void *get_block)
@@ -2748,7 +2750,7 @@ int block_write_full_folio(struct folio *folio, struct writeback_control *wbc,
/*
* The folio straddles i_size. It must be zeroed out on each and every
- * writepage invocation because it may be mmapped. "A file is mapped
+ * writeback invocation because it may be mmapped. "A file is mapped
* in multiples of the page size. For a file that is not a multiple of
* the page size, the remaining memory is zeroed when mapped, and
* writes to that region are not written out to the file."
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index 38c236e38cef..b62cd3e9a18e 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -71,7 +71,6 @@ struct cachefiles_object {
int debug_id;
spinlock_t lock;
refcount_t ref;
- u8 d_name_len; /* Length of filename */
enum cachefiles_content content_info:8; /* Info about content presence */
unsigned long flags;
#define CACHEFILES_OBJECT_USING_TMPFILE 0 /* Have an unlinked tmpfile */
diff --git a/fs/cachefiles/key.c b/fs/cachefiles/key.c
index b48525680e73..aae86af48ed5 100644
--- a/fs/cachefiles/key.c
+++ b/fs/cachefiles/key.c
@@ -132,7 +132,6 @@ bool cachefiles_cook_key(struct cachefiles_object *object)
success:
name[len] = 0;
object->d_name = name;
- object->d_name_len = len;
_leave(" = %s", object->d_name);
return true;
}
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 14d0cc894000..aecfc5c37b49 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -98,7 +98,7 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
retry:
ret = cachefiles_inject_read_error();
if (ret == 0)
- subdir = lookup_one_len(dirname, dir, strlen(dirname));
+ subdir = lookup_one(&nop_mnt_idmap, &QSTR(dirname), dir);
else
subdir = ERR_PTR(ret);
trace_cachefiles_lookup(NULL, dir, subdir);
@@ -338,7 +338,7 @@ try_again:
return -EIO;
}
- grave = lookup_one_len(nbuffer, cache->graveyard, strlen(nbuffer));
+ grave = lookup_one(&nop_mnt_idmap, &QSTR(nbuffer), cache->graveyard);
if (IS_ERR(grave)) {
unlock_rename(cache->graveyard, dir);
trace_cachefiles_vfs_error(object, d_inode(cache->graveyard),
@@ -630,8 +630,8 @@ bool cachefiles_look_up_object(struct cachefiles_object *object)
/* Look up path "cache/vol/fanout/file". */
ret = cachefiles_inject_read_error();
if (ret == 0)
- dentry = lookup_positive_unlocked(object->d_name, fan,
- object->d_name_len);
+ dentry = lookup_one_positive_unlocked(&nop_mnt_idmap,
+ &QSTR(object->d_name), fan);
else
dentry = ERR_PTR(ret);
trace_cachefiles_lookup(object, fan, dentry);
@@ -683,7 +683,7 @@ bool cachefiles_commit_tmpfile(struct cachefiles_cache *cache,
inode_lock_nested(d_inode(fan), I_MUTEX_PARENT);
ret = cachefiles_inject_read_error();
if (ret == 0)
- dentry = lookup_one_len(object->d_name, fan, object->d_name_len);
+ dentry = lookup_one(&nop_mnt_idmap, &QSTR(object->d_name), fan);
else
dentry = ERR_PTR(ret);
if (IS_ERR(dentry)) {
@@ -702,7 +702,7 @@ bool cachefiles_commit_tmpfile(struct cachefiles_cache *cache,
dput(dentry);
ret = cachefiles_inject_read_error();
if (ret == 0)
- dentry = lookup_one_len(object->d_name, fan, object->d_name_len);
+ dentry = lookup_one(&nop_mnt_idmap, &QSTR(object->d_name), fan);
else
dentry = ERR_PTR(ret);
if (IS_ERR(dentry)) {
@@ -751,7 +751,7 @@ static struct dentry *cachefiles_lookup_for_cull(struct cachefiles_cache *cache,
inode_lock_nested(d_inode(dir), I_MUTEX_PARENT);
- victim = lookup_one_len(filename, dir, strlen(filename));
+ victim = lookup_one(&nop_mnt_idmap, &QSTR(filename), dir);
if (IS_ERR(victim))
goto lookup_error;
if (d_is_negative(victim))
diff --git a/fs/coredump.c b/fs/coredump.c
index c33c177a701b..f217ebf2b3b6 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -43,6 +43,14 @@
#include <linux/timekeeping.h>
#include <linux/sysctl.h>
#include <linux/elf.h>
+#include <linux/pidfs.h>
+#include <linux/net.h>
+#include <linux/socket.h>
+#include <net/af_unix.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+#include <uapi/linux/pidfd.h>
+#include <uapi/linux/un.h>
#include <linux/uaccess.h>
#include <asm/mmu_context.h>
@@ -60,6 +68,12 @@ static void free_vma_snapshot(struct coredump_params *cprm);
#define CORE_FILE_NOTE_SIZE_DEFAULT (4*1024*1024)
/* Define a reasonable max cap */
#define CORE_FILE_NOTE_SIZE_MAX (16*1024*1024)
+/*
+ * File descriptor number for the pidfd for the thread-group leader of
+ * the coredumping task installed into the usermode helper's file
+ * descriptor table.
+ */
+#define COREDUMP_PIDFD_NUMBER 3
static int core_uses_pid;
static unsigned int core_pipe_limit;
@@ -68,9 +82,16 @@ static char core_pattern[CORENAME_MAX_SIZE] = "core";
static int core_name_size = CORENAME_MAX_SIZE;
unsigned int core_file_note_size_limit = CORE_FILE_NOTE_SIZE_DEFAULT;
+enum coredump_type_t {
+ COREDUMP_FILE = 1,
+ COREDUMP_PIPE = 2,
+ COREDUMP_SOCK = 3,
+};
+
struct core_name {
char *corename;
int used, size;
+ enum coredump_type_t core_type;
};
static int expand_corename(struct core_name *cn, int size)
@@ -210,18 +231,24 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm,
{
const struct cred *cred = current_cred();
const char *pat_ptr = core_pattern;
- int ispipe = (*pat_ptr == '|');
bool was_space = false;
int pid_in_pattern = 0;
int err = 0;
cn->used = 0;
cn->corename = NULL;
+ if (*pat_ptr == '|')
+ cn->core_type = COREDUMP_PIPE;
+ else if (*pat_ptr == '@')
+ cn->core_type = COREDUMP_SOCK;
+ else
+ cn->core_type = COREDUMP_FILE;
if (expand_corename(cn, core_name_size))
return -ENOMEM;
cn->corename[0] = '\0';
- if (ispipe) {
+ switch (cn->core_type) {
+ case COREDUMP_PIPE: {
int argvs = sizeof(core_pattern) / 2;
(*argv) = kmalloc_array(argvs, sizeof(**argv), GFP_KERNEL);
if (!(*argv))
@@ -230,6 +257,45 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm,
++pat_ptr;
if (!(*pat_ptr))
return -ENOMEM;
+ break;
+ }
+ case COREDUMP_SOCK: {
+ /* skip the @ */
+ pat_ptr++;
+ if (!(*pat_ptr))
+ return -ENOMEM;
+
+ err = cn_printf(cn, "%s", pat_ptr);
+ if (err)
+ return err;
+
+ /* Require absolute paths. */
+ if (cn->corename[0] != '/')
+ return -EINVAL;
+
+ /*
+ * Ensure we can uses spaces to indicate additional
+ * parameters in the future.
+ */
+ if (strchr(cn->corename, ' ')) {
+ coredump_report_failure("Coredump socket may not %s contain spaces", cn->corename);
+ return -EINVAL;
+ }
+
+ /*
+ * Currently no need to parse any other options.
+ * Relevant information can be retrieved from the peer
+ * pidfd retrievable via SO_PEERPIDFD by the receiver or
+ * via /proc/<pid>, using the SO_PEERPIDFD to guard
+ * against pid recycling when opening /proc/<pid>.
+ */
+ return 0;
+ }
+ case COREDUMP_FILE:
+ break;
+ default:
+ WARN_ON_ONCE(true);
+ return -EINVAL;
}
/* Repeat as long as we have more pattern to process and more output
@@ -239,7 +305,7 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm,
* Split on spaces before doing template expansion so that
* %e and %E don't get split if they have spaces in them
*/
- if (ispipe) {
+ if (cn->core_type == COREDUMP_PIPE) {
if (isspace(*pat_ptr)) {
if (cn->used != 0)
was_space = true;
@@ -339,6 +405,27 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm,
case 'C':
err = cn_printf(cn, "%d", cprm->cpu);
break;
+ /* pidfd number */
+ case 'F': {
+ /*
+ * Installing a pidfd only makes sense if
+ * we actually spawn a usermode helper.
+ */
+ if (cn->core_type != COREDUMP_PIPE)
+ break;
+
+ /*
+ * Note that we'll install a pidfd for the
+ * thread-group leader. We know that task
+ * linkage hasn't been removed yet and even if
+ * this @current isn't the actual thread-group
+ * leader we know that the thread-group leader
+ * cannot be reaped until @current has exited.
+ */
+ cprm->pid = task_tgid(current);
+ err = cn_printf(cn, "%d", COREDUMP_PIDFD_NUMBER);
+ break;
+ }
default:
break;
}
@@ -355,12 +442,10 @@ out:
* If core_pattern does not include a %p (as is the default)
* and core_uses_pid is set, then .%pid will be appended to
* the filename. Do not do this for piped commands. */
- if (!ispipe && !pid_in_pattern && core_uses_pid) {
- err = cn_printf(cn, ".%d", task_tgid_vnr(current));
- if (err)
- return err;
- }
- return ispipe;
+ if (cn->core_type == COREDUMP_FILE && !pid_in_pattern && core_uses_pid)
+ return cn_printf(cn, ".%d", task_tgid_vnr(current));
+
+ return 0;
}
static int zap_process(struct signal_struct *signal, int exit_code)
@@ -493,7 +578,7 @@ static void wait_for_dump_helpers(struct file *file)
}
/*
- * umh_pipe_setup
+ * umh_coredump_setup
* helper function to customize the process used
* to collect the core in userspace. Specifically
* it sets up a pipe and installs it as fd 0 (stdin)
@@ -503,11 +588,34 @@ static void wait_for_dump_helpers(struct file *file)
* is a special value that we use to trap recursive
* core dumps
*/
-static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
+static int umh_coredump_setup(struct subprocess_info *info, struct cred *new)
{
struct file *files[2];
struct coredump_params *cp = (struct coredump_params *)info->data;
- int err = create_pipe_files(files, 0);
+ int err;
+
+ if (cp->pid) {
+ struct file *pidfs_file __free(fput) = NULL;
+
+ pidfs_file = pidfs_alloc_file(cp->pid, 0);
+ if (IS_ERR(pidfs_file))
+ return PTR_ERR(pidfs_file);
+
+ pidfs_coredump(cp);
+
+ /*
+ * Usermode helpers are childen of either
+ * system_unbound_wq or of kthreadd. So we know that
+ * we're starting off with a clean file descriptor
+ * table. So we should always be able to use
+ * COREDUMP_PIDFD_NUMBER as our file descriptor value.
+ */
+ err = replace_fd(COREDUMP_PIDFD_NUMBER, pidfs_file, 0);
+ if (err < 0)
+ return err;
+ }
+
+ err = create_pipe_files(files, 0);
if (err)
return err;
@@ -515,10 +623,13 @@ static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
err = replace_fd(0, files[0], 0);
fput(files[0]);
+ if (err < 0)
+ return err;
+
/* and disallow core files too */
current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1};
- return err;
+ return 0;
}
void do_coredump(const kernel_siginfo_t *siginfo)
@@ -530,7 +641,6 @@ void do_coredump(const kernel_siginfo_t *siginfo)
const struct cred *old_cred;
struct cred *cred;
int retval = 0;
- int ispipe;
size_t *argv = NULL;
int argc = 0;
/* require nonrelative corefile path and be extra careful */
@@ -579,70 +689,14 @@ void do_coredump(const kernel_siginfo_t *siginfo)
old_cred = override_creds(cred);
- ispipe = format_corename(&cn, &cprm, &argv, &argc);
-
- if (ispipe) {
- int argi;
- int dump_count;
- char **helper_argv;
- struct subprocess_info *sub_info;
-
- if (ispipe < 0) {
- coredump_report_failure("format_corename failed, aborting core");
- goto fail_unlock;
- }
-
- if (cprm.limit == 1) {
- /* See umh_pipe_setup() which sets RLIMIT_CORE = 1.
- *
- * Normally core limits are irrelevant to pipes, since
- * we're not writing to the file system, but we use
- * cprm.limit of 1 here as a special value, this is a
- * consistent way to catch recursive crashes.
- * We can still crash if the core_pattern binary sets
- * RLIM_CORE = !1, but it runs as root, and can do
- * lots of stupid things.
- *
- * Note that we use task_tgid_vnr here to grab the pid
- * of the process group leader. That way we get the
- * right pid if a thread in a multi-threaded
- * core_pattern process dies.
- */
- coredump_report_failure("RLIMIT_CORE is set to 1, aborting core");
- goto fail_unlock;
- }
- cprm.limit = RLIM_INFINITY;
-
- dump_count = atomic_inc_return(&core_dump_count);
- if (core_pipe_limit && (core_pipe_limit < dump_count)) {
- coredump_report_failure("over core_pipe_limit, skipping core dump");
- goto fail_dropcount;
- }
-
- helper_argv = kmalloc_array(argc + 1, sizeof(*helper_argv),
- GFP_KERNEL);
- if (!helper_argv) {
- coredump_report_failure("%s failed to allocate memory", __func__);
- goto fail_dropcount;
- }
- for (argi = 0; argi < argc; argi++)
- helper_argv[argi] = cn.corename + argv[argi];
- helper_argv[argi] = NULL;
-
- retval = -ENOMEM;
- sub_info = call_usermodehelper_setup(helper_argv[0],
- helper_argv, NULL, GFP_KERNEL,
- umh_pipe_setup, NULL, &cprm);
- if (sub_info)
- retval = call_usermodehelper_exec(sub_info,
- UMH_WAIT_EXEC);
+ retval = format_corename(&cn, &cprm, &argv, &argc);
+ if (retval < 0) {
+ coredump_report_failure("format_corename failed, aborting core");
+ goto fail_unlock;
+ }
- kfree(helper_argv);
- if (retval) {
- coredump_report_failure("|%s pipe failed", cn.corename);
- goto close_fail;
- }
- } else {
+ switch (cn.core_type) {
+ case COREDUMP_FILE: {
struct mnt_idmap *idmap;
struct inode *inode;
int open_flags = O_CREAT | O_WRONLY | O_NOFOLLOW |
@@ -736,6 +790,143 @@ void do_coredump(const kernel_siginfo_t *siginfo)
if (do_truncate(idmap, cprm.file->f_path.dentry,
0, 0, cprm.file))
goto close_fail;
+ break;
+ }
+ case COREDUMP_PIPE: {
+ int argi;
+ int dump_count;
+ char **helper_argv;
+ struct subprocess_info *sub_info;
+
+ if (cprm.limit == 1) {
+ /* See umh_coredump_setup() which sets RLIMIT_CORE = 1.
+ *
+ * Normally core limits are irrelevant to pipes, since
+ * we're not writing to the file system, but we use
+ * cprm.limit of 1 here as a special value, this is a
+ * consistent way to catch recursive crashes.
+ * We can still crash if the core_pattern binary sets
+ * RLIM_CORE = !1, but it runs as root, and can do
+ * lots of stupid things.
+ *
+ * Note that we use task_tgid_vnr here to grab the pid
+ * of the process group leader. That way we get the
+ * right pid if a thread in a multi-threaded
+ * core_pattern process dies.
+ */
+ coredump_report_failure("RLIMIT_CORE is set to 1, aborting core");
+ goto fail_unlock;
+ }
+ cprm.limit = RLIM_INFINITY;
+
+ dump_count = atomic_inc_return(&core_dump_count);
+ if (core_pipe_limit && (core_pipe_limit < dump_count)) {
+ coredump_report_failure("over core_pipe_limit, skipping core dump");
+ goto fail_dropcount;
+ }
+
+ helper_argv = kmalloc_array(argc + 1, sizeof(*helper_argv),
+ GFP_KERNEL);
+ if (!helper_argv) {
+ coredump_report_failure("%s failed to allocate memory", __func__);
+ goto fail_dropcount;
+ }
+ for (argi = 0; argi < argc; argi++)
+ helper_argv[argi] = cn.corename + argv[argi];
+ helper_argv[argi] = NULL;
+
+ retval = -ENOMEM;
+ sub_info = call_usermodehelper_setup(helper_argv[0],
+ helper_argv, NULL, GFP_KERNEL,
+ umh_coredump_setup, NULL, &cprm);
+ if (sub_info)
+ retval = call_usermodehelper_exec(sub_info,
+ UMH_WAIT_EXEC);
+
+ kfree(helper_argv);
+ if (retval) {
+ coredump_report_failure("|%s pipe failed", cn.corename);
+ goto close_fail;
+ }
+ break;
+ }
+ case COREDUMP_SOCK: {
+#ifdef CONFIG_UNIX
+ struct file *file __free(fput) = NULL;
+ struct sockaddr_un addr = {
+ .sun_family = AF_UNIX,
+ };
+ ssize_t addr_len;
+ struct socket *socket;
+
+ addr_len = strscpy(addr.sun_path, cn.corename);
+ if (addr_len < 0)
+ goto close_fail;
+ addr_len += offsetof(struct sockaddr_un, sun_path) + 1;
+
+ /*
+ * It is possible that the userspace process which is
+ * supposed to handle the coredump and is listening on
+ * the AF_UNIX socket coredumps. Userspace should just
+ * mark itself non dumpable.
+ */
+
+ retval = sock_create_kern(&init_net, AF_UNIX, SOCK_STREAM, 0, &socket);
+ if (retval < 0)
+ goto close_fail;
+
+ file = sock_alloc_file(socket, 0, NULL);
+ if (IS_ERR(file))
+ goto close_fail;
+
+ /*
+ * Set the thread-group leader pid which is used for the
+ * peer credentials during connect() below. Then
+ * immediately register it in pidfs...
+ */
+ cprm.pid = task_tgid(current);
+ retval = pidfs_register_pid(cprm.pid);
+ if (retval)
+ goto close_fail;
+
+ /*
+ * ... and set the coredump information so userspace
+ * has it available after connect()...
+ */
+ pidfs_coredump(&cprm);
+
+ retval = kernel_connect(socket, (struct sockaddr *)(&addr),
+ addr_len, O_NONBLOCK | SOCK_COREDUMP);
+
+ /*
+ * ... Make sure to only put our reference after connect() took
+ * its own reference keeping the pidfs entry alive ...
+ */
+ pidfs_put_pid(cprm.pid);
+
+ if (retval) {
+ if (retval == -EAGAIN)
+ coredump_report_failure("Coredump socket %s receive queue full", addr.sun_path);
+ else
+ coredump_report_failure("Coredump socket connection %s failed %d", addr.sun_path, retval);
+ goto close_fail;
+ }
+
+ /* ... and validate that @sk_peer_pid matches @cprm.pid. */
+ if (WARN_ON_ONCE(unix_peer(socket->sk)->sk_peer_pid != cprm.pid))
+ goto close_fail;
+
+ cprm.limit = RLIM_INFINITY;
+ cprm.file = no_free_ptr(file);
+#else
+ coredump_report_failure("Core dump socket support %s disabled", cn.corename);
+ goto close_fail;
+#endif
+ break;
+ }
+ default:
+ WARN_ON_ONCE(true);
+ goto close_fail;
}
/* get us an unshared descriptor table; almost always a no-op */
@@ -770,13 +961,49 @@ void do_coredump(const kernel_siginfo_t *siginfo)
file_end_write(cprm.file);
free_vma_snapshot(&cprm);
}
- if (ispipe && core_pipe_limit)
- wait_for_dump_helpers(cprm.file);
+
+#ifdef CONFIG_UNIX
+ /* Let userspace know we're done processing the coredump. */
+ if (sock_from_file(cprm.file))
+ kernel_sock_shutdown(sock_from_file(cprm.file), SHUT_WR);
+#endif
+
+ /*
+ * When core_pipe_limit is set we wait for the coredump server
+ * or usermodehelper to finish before exiting so it can e.g.,
+ * inspect /proc/<pid>.
+ */
+ if (core_pipe_limit) {
+ switch (cn.core_type) {
+ case COREDUMP_PIPE:
+ wait_for_dump_helpers(cprm.file);
+ break;
+#ifdef CONFIG_UNIX
+ case COREDUMP_SOCK: {
+ ssize_t n;
+
+ /*
+ * We use a simple read to wait for the coredump
+ * processing to finish. Either the socket is
+ * closed or we get sent unexpected data. In
+ * both cases, we're done.
+ */
+ n = __kernel_read(cprm.file, &(char){ 0 }, 1, NULL);
+ if (n != 0)
+ coredump_report_failure("Unexpected data on coredump socket");
+ break;
+ }
+#endif
+ default:
+ break;
+ }
+ }
+
close_fail:
if (cprm.file)
filp_close(cprm.file, NULL);
fail_dropcount:
- if (ispipe)
+ if (cn.core_type == COREDUMP_PIPE)
atomic_dec(&core_dump_count);
fail_unlock:
kfree(argv);
@@ -799,10 +1026,9 @@ static int __dump_emit(struct coredump_params *cprm, const void *addr, int nr)
struct file *file = cprm->file;
loff_t pos = file->f_pos;
ssize_t n;
+
if (cprm->written + nr > cprm->limit)
return 0;
-
-
if (dump_interrupted())
return 0;
n = __kernel_write(file, addr, nr, &pos);
@@ -819,20 +1045,21 @@ static int __dump_skip(struct coredump_params *cprm, size_t nr)
{
static char zeroes[PAGE_SIZE];
struct file *file = cprm->file;
+
if (file->f_mode & FMODE_LSEEK) {
- if (dump_interrupted() ||
- vfs_llseek(file, nr, SEEK_CUR) < 0)
+ if (dump_interrupted() || vfs_llseek(file, nr, SEEK_CUR) < 0)
return 0;
cprm->pos += nr;
return 1;
- } else {
- while (nr > PAGE_SIZE) {
- if (!__dump_emit(cprm, zeroes, PAGE_SIZE))
- return 0;
- nr -= PAGE_SIZE;
- }
- return __dump_emit(cprm, zeroes, nr);
}
+
+ while (nr > PAGE_SIZE) {
+ if (!__dump_emit(cprm, zeroes, PAGE_SIZE))
+ return 0;
+ nr -= PAGE_SIZE;
+ }
+
+ return __dump_emit(cprm, zeroes, nr);
}
int dump_emit(struct coredump_params *cprm, const void *addr, int nr)
@@ -1001,7 +1228,7 @@ EXPORT_SYMBOL(dump_align);
void validate_coredump_safety(void)
{
if (suid_dumpable == SUID_DUMP_ROOT &&
- core_pattern[0] != '/' && core_pattern[0] != '|') {
+ core_pattern[0] != '/' && core_pattern[0] != '|' && core_pattern[0] != '@') {
coredump_report_failure("Unsafe core_pattern used with fs.suid_dumpable=2: "
"pipe handler or fully qualified core dump path required. "
@@ -1009,18 +1236,55 @@ void validate_coredump_safety(void)
}
}
+static inline bool check_coredump_socket(void)
+{
+ if (core_pattern[0] != '@')
+ return true;
+
+ /*
+ * Coredump socket must be located in the initial mount
+ * namespace. Don't give the impression that anything else is
+ * supported right now.
+ */
+ if (current->nsproxy->mnt_ns != init_task.nsproxy->mnt_ns)
+ return false;
+
+ /* Must be an absolute path. */
+ if (*(core_pattern + 1) != '/')
+ return false;
+
+ return true;
+}
+
static int proc_dostring_coredump(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
- int error = proc_dostring(table, write, buffer, lenp, ppos);
+ int error;
+ ssize_t retval;
+ char old_core_pattern[CORENAME_MAX_SIZE];
+
+ retval = strscpy(old_core_pattern, core_pattern, CORENAME_MAX_SIZE);
+
+ error = proc_dostring(table, write, buffer, lenp, ppos);
+ if (error)
+ return error;
+ if (!check_coredump_socket()) {
+ strscpy(core_pattern, old_core_pattern, retval + 1);
+ return -EINVAL;
+ }
- if (!error)
- validate_coredump_safety();
+ validate_coredump_safety();
return error;
}
static const unsigned int core_file_note_size_min = CORE_FILE_NOTE_SIZE_DEFAULT;
static const unsigned int core_file_note_size_max = CORE_FILE_NOTE_SIZE_MAX;
+static char core_modes[] = {
+ "file\npipe"
+#ifdef CONFIG_UNIX
+ "\nsocket"
+#endif
+};
static const struct ctl_table coredump_sysctls[] = {
{
@@ -1064,6 +1328,13 @@ static const struct ctl_table coredump_sysctls[] = {
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
+ {
+ .procname = "core_modes",
+ .data = core_modes,
+ .maxlen = sizeof(core_modes) - 1,
+ .mode = 0444,
+ .proc_handler = proc_dostring,
+ },
};
static int __init init_fs_coredump_sysctls(void)
diff --git a/fs/dcache.c b/fs/dcache.c
index bd5aa136153a..03d58b2d4fa3 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -74,10 +74,11 @@
* arbitrary, since it's serialized on rename_lock
*/
static int sysctl_vfs_cache_pressure __read_mostly = 100;
+static int sysctl_vfs_cache_pressure_denom __read_mostly = 100;
unsigned long vfs_pressure_ratio(unsigned long val)
{
- return mult_frac(val, sysctl_vfs_cache_pressure, 100);
+ return mult_frac(val, sysctl_vfs_cache_pressure, sysctl_vfs_cache_pressure_denom);
}
EXPORT_SYMBOL_GPL(vfs_pressure_ratio);
@@ -225,6 +226,14 @@ static const struct ctl_table vm_dcache_sysctls[] = {
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
},
+ {
+ .procname = "vfs_cache_pressure_denom",
+ .data = &sysctl_vfs_cache_pressure_denom,
+ .maxlen = sizeof(sysctl_vfs_cache_pressure_denom),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ONE_HUNDRED,
+ },
};
static int __init init_fs_dcache_sysctls(void)
@@ -2412,7 +2421,6 @@ struct dentry *d_hash_and_lookup(struct dentry *dir, struct qstr *name)
}
return d_lookup(dir, name);
}
-EXPORT_SYMBOL(d_hash_and_lookup);
/*
* When a file is deleted, we have two options:
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 75715d8877ee..30c4944e1862 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -346,7 +346,7 @@ struct dentry *debugfs_lookup(const char *name, struct dentry *parent)
if (!parent)
parent = debugfs_mount->mnt_root;
- dentry = lookup_positive_unlocked(name, parent, strlen(name));
+ dentry = lookup_noperm_positive_unlocked(&QSTR(name), parent);
if (IS_ERR(dentry))
return NULL;
return dentry;
@@ -388,7 +388,7 @@ static struct dentry *start_creating(const char *name, struct dentry *parent)
if (unlikely(IS_DEADDIR(d_inode(parent))))
dentry = ERR_PTR(-ENOENT);
else
- dentry = lookup_one_len(name, parent, strlen(name));
+ dentry = lookup_noperm(&QSTR(name), parent);
if (!IS_ERR(dentry) && d_really_is_positive(dentry)) {
if (d_is_dir(dentry))
pr_err("Directory '%s' with parent '%s' already present!\n",
@@ -872,7 +872,7 @@ int __printf(2, 3) debugfs_change_name(struct dentry *dentry, const char *fmt, .
}
if (strcmp(old_name.name.name, new_name) == 0)
goto out;
- target = lookup_one_len(new_name, parent, strlen(new_name));
+ target = lookup_noperm(&QSTR(new_name), parent);
if (IS_ERR(target)) {
error = PTR_ERR(target);
goto out;
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 51a5c54eb740..493d7f194956 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -394,8 +394,8 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
char *encrypted_and_encoded_name = NULL;
struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
struct dentry *lower_dir_dentry, *lower_dentry;
- const char *name = ecryptfs_dentry->d_name.name;
- size_t len = ecryptfs_dentry->d_name.len;
+ struct qstr qname = QSTR_INIT(ecryptfs_dentry->d_name.name,
+ ecryptfs_dentry->d_name.len);
struct dentry *res;
int rc = 0;
@@ -404,23 +404,25 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
mount_crypt_stat = &ecryptfs_superblock_to_private(
ecryptfs_dentry->d_sb)->mount_crypt_stat;
if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) {
+ size_t len = qname.len;
rc = ecryptfs_encrypt_and_encode_filename(
&encrypted_and_encoded_name, &len,
- mount_crypt_stat, name, len);
+ mount_crypt_stat, qname.name, len);
if (rc) {
printk(KERN_ERR "%s: Error attempting to encrypt and encode "
"filename; rc = [%d]\n", __func__, rc);
return ERR_PTR(rc);
}
- name = encrypted_and_encoded_name;
+ qname.name = encrypted_and_encoded_name;
+ qname.len = len;
}
- lower_dentry = lookup_one_len_unlocked(name, lower_dir_dentry, len);
+ lower_dentry = lookup_noperm_unlocked(&qname, lower_dir_dentry);
if (IS_ERR(lower_dentry)) {
- ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned "
+ ecryptfs_printk(KERN_DEBUG, "%s: lookup_noperm() returned "
"[%ld] on lower_dentry = [%s]\n", __func__,
PTR_ERR(lower_dentry),
- name);
+ qname.name);
res = ERR_CAST(lower_dentry);
} else {
res = ecryptfs_lookup_interpose(ecryptfs_dentry, lower_dentry);
diff --git a/fs/efivarfs/internal.h b/fs/efivarfs/internal.h
index ac6a1dd0a6a5..f913b6824289 100644
--- a/fs/efivarfs/internal.h
+++ b/fs/efivarfs/internal.h
@@ -17,7 +17,6 @@ struct efivarfs_fs_info {
struct efivarfs_mount_opts mount_opts;
struct super_block *sb;
struct notifier_block nb;
- struct notifier_block pm_nb;
};
struct efi_variable {
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index 0486e9b68bc6..c900d98bf494 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -18,8 +18,10 @@
#include <linux/statfs.h>
#include <linux/notifier.h>
#include <linux/printk.h>
+#include <linux/namei.h>
#include "internal.h"
+#include "../internal.h"
static int efivarfs_ops_notifier(struct notifier_block *nb, unsigned long event,
void *data)
@@ -119,12 +121,18 @@ static int efivarfs_statfs(struct dentry *dentry, struct kstatfs *buf)
return 0;
}
+
+static int efivarfs_freeze_fs(struct super_block *sb);
+static int efivarfs_unfreeze_fs(struct super_block *sb);
+
static const struct super_operations efivarfs_ops = {
.statfs = efivarfs_statfs,
.drop_inode = generic_delete_inode,
.alloc_inode = efivarfs_alloc_inode,
.free_inode = efivarfs_free_inode,
.show_options = efivarfs_show_options,
+ .freeze_fs = efivarfs_freeze_fs,
+ .unfreeze_fs = efivarfs_unfreeze_fs,
};
/*
@@ -204,7 +212,6 @@ bool efivarfs_variable_is_present(efi_char16_t *variable_name,
char *name = efivar_get_utf8name(variable_name, vendor);
struct super_block *sb = data;
struct dentry *dentry;
- struct qstr qstr;
if (!name)
/*
@@ -217,9 +224,7 @@ bool efivarfs_variable_is_present(efi_char16_t *variable_name,
*/
return true;
- qstr.name = name;
- qstr.len = strlen(name);
- dentry = d_hash_and_lookup(sb->s_root, &qstr);
+ dentry = try_lookup_noperm(&QSTR(name), sb->s_root);
kfree(name);
if (!IS_ERR_OR_NULL(dentry))
dput(dentry);
@@ -367,8 +372,6 @@ static int efivarfs_fill_super(struct super_block *sb, struct fs_context *fc)
if (err)
return err;
- register_pm_notifier(&sfi->pm_nb);
-
return efivar_init(efivarfs_callback, sb, true);
}
@@ -393,55 +396,12 @@ static const struct fs_context_operations efivarfs_context_ops = {
.reconfigure = efivarfs_reconfigure,
};
-struct efivarfs_ctx {
- struct dir_context ctx;
- struct super_block *sb;
- struct dentry *dentry;
-};
-
-static bool efivarfs_actor(struct dir_context *ctx, const char *name, int len,
- loff_t offset, u64 ino, unsigned mode)
-{
- unsigned long size;
- struct efivarfs_ctx *ectx = container_of(ctx, struct efivarfs_ctx, ctx);
- struct qstr qstr = { .name = name, .len = len };
- struct dentry *dentry = d_hash_and_lookup(ectx->sb->s_root, &qstr);
- struct inode *inode;
- struct efivar_entry *entry;
- int err;
-
- if (IS_ERR_OR_NULL(dentry))
- return true;
-
- inode = d_inode(dentry);
- entry = efivar_entry(inode);
-
- err = efivar_entry_size(entry, &size);
- size += sizeof(__u32); /* attributes */
- if (err)
- size = 0;
-
- inode_lock_nested(inode, I_MUTEX_CHILD);
- i_size_write(inode, size);
- inode_unlock(inode);
-
- if (!size) {
- ectx->dentry = dentry;
- return false;
- }
-
- dput(dentry);
-
- return true;
-}
-
static int efivarfs_check_missing(efi_char16_t *name16, efi_guid_t vendor,
unsigned long name_size, void *data)
{
char *name;
struct super_block *sb = data;
struct dentry *dentry;
- struct qstr qstr;
int err;
if (guid_equal(&vendor, &LINUX_EFI_RANDOM_SEED_TABLE_GUID))
@@ -451,9 +411,7 @@ static int efivarfs_check_missing(efi_char16_t *name16, efi_guid_t vendor,
if (!name)
return -ENOMEM;
- qstr.name = name;
- qstr.len = strlen(name);
- dentry = d_hash_and_lookup(sb->s_root, &qstr);
+ dentry = try_lookup_noperm(&QSTR(name), sb->s_root);
if (IS_ERR(dentry)) {
err = PTR_ERR(dentry);
goto out;
@@ -474,111 +432,59 @@ static int efivarfs_check_missing(efi_char16_t *name16, efi_guid_t vendor,
return err;
}
-static void efivarfs_deactivate_super_work(struct work_struct *work)
-{
- struct super_block *s = container_of(work, struct super_block,
- destroy_work);
- /*
- * note: here s->destroy_work is free for reuse (which
- * will happen in deactivate_super)
- */
- deactivate_super(s);
-}
-
static struct file_system_type efivarfs_type;
-static int efivarfs_pm_notify(struct notifier_block *nb, unsigned long action,
- void *ptr)
+static int efivarfs_freeze_fs(struct super_block *sb)
{
- struct efivarfs_fs_info *sfi = container_of(nb, struct efivarfs_fs_info,
- pm_nb);
- struct path path;
- struct efivarfs_ctx ectx = {
- .ctx = {
- .actor = efivarfs_actor,
- },
- .sb = sfi->sb,
- };
- struct file *file;
- struct super_block *s = sfi->sb;
- static bool rescan_done = true;
-
- if (action == PM_HIBERNATION_PREPARE) {
- rescan_done = false;
- return NOTIFY_OK;
- } else if (action != PM_POST_HIBERNATION) {
- return NOTIFY_DONE;
- }
-
- if (rescan_done)
- return NOTIFY_DONE;
-
- /* ensure single superblock is alive and pin it */
- if (!atomic_inc_not_zero(&s->s_active))
- return NOTIFY_DONE;
-
- pr_info("efivarfs: resyncing variable state\n");
-
- path.dentry = sfi->sb->s_root;
-
- /*
- * do not add SB_KERNMOUNT which a single superblock could
- * expose to userspace and which also causes MNT_INTERNAL, see
- * below
- */
- path.mnt = vfs_kern_mount(&efivarfs_type, 0,
- efivarfs_type.name, NULL);
- if (IS_ERR(path.mnt)) {
- pr_err("efivarfs: internal mount failed\n");
- /*
- * We may be the last pinner of the superblock but
- * calling efivarfs_kill_sb from within the notifier
- * here would deadlock trying to unregister it
- */
- INIT_WORK(&s->destroy_work, efivarfs_deactivate_super_work);
- schedule_work(&s->destroy_work);
- return PTR_ERR(path.mnt);
- }
-
- /* path.mnt now has pin on superblock, so this must be above one */
- atomic_dec(&s->s_active);
-
- file = kernel_file_open(&path, O_RDONLY | O_DIRECTORY | O_NOATIME,
- current_cred());
- /*
- * safe even if last put because no MNT_INTERNAL means this
- * will do delayed deactivate_super and not deadlock
- */
- mntput(path.mnt);
- if (IS_ERR(file))
- return NOTIFY_DONE;
+ /* Nothing for us to do. */
+ return 0;
+}
- rescan_done = true;
+static int efivarfs_unfreeze_fs(struct super_block *sb)
+{
+ struct dentry *child = NULL;
/*
- * First loop over the directory and verify each entry exists,
- * removing it if it doesn't
+ * Unconditionally resync the variable state on a thaw request.
+ * Given the size of efivarfs it really doesn't matter to simply
+ * iterate through all of the entries and resync. Freeze/thaw
+ * requests are rare enough for that to not matter and the
+ * number of entries is pretty low too. So we really don't care.
*/
- file->f_pos = 2; /* skip . and .. */
- do {
- ectx.dentry = NULL;
- iterate_dir(file, &ectx.ctx);
- if (ectx.dentry) {
- pr_info("efivarfs: removing variable %pd\n",
- ectx.dentry);
- simple_recursive_removal(ectx.dentry, NULL);
- dput(ectx.dentry);
+ pr_info("efivarfs: resyncing variable state\n");
+ for (;;) {
+ int err;
+ unsigned long size = 0;
+ struct inode *inode;
+ struct efivar_entry *entry;
+
+ child = find_next_child(sb->s_root, child);
+ if (!child)
+ break;
+
+ inode = d_inode(child);
+ entry = efivar_entry(inode);
+
+ err = efivar_entry_size(entry, &size);
+ if (err)
+ size = 0;
+ else
+ size += sizeof(__u32);
+
+ inode_lock(inode);
+ i_size_write(inode, size);
+ inode_unlock(inode);
+
+ /* The variable doesn't exist anymore, delete it. */
+ if (!size) {
+ pr_info("efivarfs: removing variable %pd\n", child);
+ simple_recursive_removal(child, NULL);
}
- } while (ectx.dentry);
- fput(file);
-
- /*
- * then loop over variables, creating them if there's no matching
- * dentry
- */
- efivar_init(efivarfs_check_missing, sfi->sb, false);
+ }
- return NOTIFY_OK;
+ efivar_init(efivarfs_check_missing, sb, false);
+ pr_info("efivarfs: finished resyncing variable state\n");
+ return 0;
}
static int efivarfs_init_fs_context(struct fs_context *fc)
@@ -598,9 +504,6 @@ static int efivarfs_init_fs_context(struct fs_context *fc)
fc->s_fs_info = sfi;
fc->ops = &efivarfs_context_ops;
- sfi->pm_nb.notifier_call = efivarfs_pm_notify;
- sfi->pm_nb.priority = 0;
-
return 0;
}
@@ -610,7 +513,6 @@ static void efivarfs_kill_sb(struct super_block *sb)
blocking_notifier_chain_unregister(&efivar_ops_nh, &sfi->nb);
kill_litter_super(sb);
- unregister_pm_notifier(&sfi->pm_nb);
kfree(sfi);
}
diff --git a/fs/exec.c b/fs/exec.c
index 8e4ea5f1e64c..cfbb2b9ee3c9 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -115,66 +115,6 @@ bool path_noexec(const struct path *path)
(path->mnt->mnt_sb->s_iflags & SB_I_NOEXEC);
}
-#ifdef CONFIG_USELIB
-/*
- * Note that a shared library must be both readable and executable due to
- * security reasons.
- *
- * Also note that we take the address to load from the file itself.
- */
-SYSCALL_DEFINE1(uselib, const char __user *, library)
-{
- struct linux_binfmt *fmt;
- struct file *file;
- struct filename *tmp = getname(library);
- int error = PTR_ERR(tmp);
- static const struct open_flags uselib_flags = {
- .open_flag = O_LARGEFILE | O_RDONLY,
- .acc_mode = MAY_READ | MAY_EXEC,
- .intent = LOOKUP_OPEN,
- .lookup_flags = LOOKUP_FOLLOW,
- };
-
- if (IS_ERR(tmp))
- goto out;
-
- file = do_filp_open(AT_FDCWD, tmp, &uselib_flags);
- putname(tmp);
- error = PTR_ERR(file);
- if (IS_ERR(file))
- goto out;
-
- /*
- * Check do_open_execat() for an explanation.
- */
- error = -EACCES;
- if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode)) ||
- path_noexec(&file->f_path))
- goto exit;
-
- error = -ENOEXEC;
-
- read_lock(&binfmt_lock);
- list_for_each_entry(fmt, &formats, lh) {
- if (!fmt->load_shlib)
- continue;
- if (!try_module_get(fmt->module))
- continue;
- read_unlock(&binfmt_lock);
- error = fmt->load_shlib(file);
- read_lock(&binfmt_lock);
- put_binfmt(fmt);
- if (error != -ENOEXEC)
- break;
- }
- read_unlock(&binfmt_lock);
-exit:
- fput(file);
-out:
- return error;
-}
-#endif /* #ifdef CONFIG_USELIB */
-
#ifdef CONFIG_MMU
/*
* The nascent bprm->mm is not visible until exec_mmap() but it can
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 128dd092916b..cdefea17986a 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -143,7 +143,7 @@ static struct dentry *reconnect_one(struct vfsmount *mnt,
if (err)
goto out_err;
dprintk("%s: found name: %s\n", __func__, nbuf);
- tmp = lookup_one_unlocked(mnt_idmap(mnt), nbuf, parent, strlen(nbuf));
+ tmp = lookup_one_unlocked(mnt_idmap(mnt), &QSTR(nbuf), parent);
if (IS_ERR(tmp)) {
dprintk("lookup failed: %ld\n", PTR_ERR(tmp));
err = PTR_ERR(tmp);
@@ -284,6 +284,7 @@ static int get_name(const struct path *path, char *name, struct dentry *child)
};
struct getdents_callback buffer = {
.ctx.actor = filldir_one,
+ .ctx.count = INT_MAX,
.name = name,
};
@@ -549,8 +550,7 @@ exportfs_decode_fh_raw(struct vfsmount *mnt, struct fid *fid, int fh_len,
}
inode_lock(target_dir->d_inode);
- nresult = lookup_one(mnt_idmap(mnt), nbuf,
- target_dir, strlen(nbuf));
+ nresult = lookup_one(mnt_idmap(mnt), &QSTR(nbuf), target_dir);
if (!IS_ERR(nresult)) {
if (unlikely(nresult->d_inode != result->d_inode)) {
dput(nresult);
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 2b8f9239bede..dd0ba0532e01 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -2271,12 +2271,12 @@ out_drop_write:
if (err)
return err;
- err = freeze_super(sbi->sb, FREEZE_HOLDER_USERSPACE);
+ err = freeze_super(sbi->sb, FREEZE_HOLDER_KERNEL, NULL);
if (err)
return err;
if (f2fs_readonly(sbi->sb)) {
- err = thaw_super(sbi->sb, FREEZE_HOLDER_USERSPACE);
+ err = thaw_super(sbi->sb, FREEZE_HOLDER_KERNEL, NULL);
if (err)
return err;
return -EROFS;
@@ -2333,6 +2333,6 @@ recover_out:
out_err:
f2fs_up_write(&sbi->cp_global_sem);
f2fs_up_write(&sbi->gc_lock);
- thaw_super(sbi->sb, FREEZE_HOLDER_USERSPACE);
+ thaw_super(sbi->sb, FREEZE_HOLDER_KERNEL, NULL);
return err;
}
diff --git a/fs/file_table.c b/fs/file_table.c
index c04ed94cdc4b..138114d64307 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -102,7 +102,7 @@ EXPORT_SYMBOL_GPL(get_max_files);
static int proc_nr_files(const struct ctl_table *table, int write, void *buffer,
size_t *lenp, loff_t *ppos)
{
- files_stat.nr_files = get_nr_files();
+ files_stat.nr_files = percpu_counter_sum_positive(&nr_files);
return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
}
diff --git a/fs/filesystems.c b/fs/filesystems.c
index 58b9067b2391..95e5256821a5 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -156,15 +156,19 @@ static int fs_index(const char __user * __name)
static int fs_name(unsigned int index, char __user * buf)
{
struct file_system_type * tmp;
- int len, res;
+ int len, res = -EINVAL;
read_lock(&file_systems_lock);
- for (tmp = file_systems; tmp; tmp = tmp->next, index--)
- if (index <= 0 && try_module_get(tmp->owner))
+ for (tmp = file_systems; tmp; tmp = tmp->next, index--) {
+ if (index == 0) {
+ if (try_module_get(tmp->owner))
+ res = 0;
break;
+ }
+ }
read_unlock(&file_systems_lock);
- if (!tmp)
- return -EINVAL;
+ if (res)
+ return res;
/* OK, we got the reference, so we can safely block */
len = strlen(tmp->name) + 1;
diff --git a/fs/fs_context.c b/fs/fs_context.c
index 582d33e81117..666e61753aed 100644
--- a/fs/fs_context.c
+++ b/fs/fs_context.c
@@ -222,7 +222,7 @@ int vfs_parse_monolithic_sep(struct fs_context *fc, void *data,
char *value = strchr(key, '=');
if (value) {
- if (value == key)
+ if (unlikely(value == key))
continue;
*value++ = 0;
v_len = strlen(value);
@@ -449,6 +449,10 @@ void logfc(struct fc_log *log, const char *prefix, char level, const char *fmt,
printk(KERN_ERR "%s%s%pV\n", prefix ? prefix : "",
prefix ? ": " : "", &vaf);
break;
+ case 'i':
+ printk(KERN_INFO "%s%s%pV\n", prefix ? prefix : "",
+ prefix ? ": " : "", &vaf);
+ break;
default:
printk(KERN_NOTICE "%s%s%pV\n", prefix ? prefix : "",
prefix ? ": " : "", &vaf);
diff --git a/fs/fs_parser.c b/fs/fs_parser.c
index e635a81e17d9..c092a9f79e32 100644
--- a/fs/fs_parser.c
+++ b/fs/fs_parser.c
@@ -380,58 +380,9 @@ EXPORT_SYMBOL(fs_param_is_path);
#ifdef CONFIG_VALIDATE_FS_PARSER
/**
- * validate_constant_table - Validate a constant table
- * @tbl: The constant table to validate.
- * @tbl_size: The size of the table.
- * @low: The lowest permissible value.
- * @high: The highest permissible value.
- * @special: One special permissible value outside of the range.
- */
-bool validate_constant_table(const struct constant_table *tbl, size_t tbl_size,
- int low, int high, int special)
-{
- size_t i;
- bool good = true;
-
- if (tbl_size == 0) {
- pr_warn("VALIDATE C-TBL: Empty\n");
- return true;
- }
-
- for (i = 0; i < tbl_size; i++) {
- if (!tbl[i].name) {
- pr_err("VALIDATE C-TBL[%zu]: Null\n", i);
- good = false;
- } else if (i > 0 && tbl[i - 1].name) {
- int c = strcmp(tbl[i-1].name, tbl[i].name);
-
- if (c == 0) {
- pr_err("VALIDATE C-TBL[%zu]: Duplicate %s\n",
- i, tbl[i].name);
- good = false;
- }
- if (c > 0) {
- pr_err("VALIDATE C-TBL[%zu]: Missorted %s>=%s\n",
- i, tbl[i-1].name, tbl[i].name);
- good = false;
- }
- }
-
- if (tbl[i].value != special &&
- (tbl[i].value < low || tbl[i].value > high)) {
- pr_err("VALIDATE C-TBL[%zu]: %s->%d const out of range (%d-%d)\n",
- i, tbl[i].name, tbl[i].value, low, high);
- good = false;
- }
- }
-
- return good;
-}
-
-/**
- * fs_validate_description - Validate a parameter description
- * @name: The parameter name to search for.
- * @desc: The parameter description to validate.
+ * fs_validate_description - Validate a parameter specification array
+ * @name: Owner name of the parameter specification array
+ * @desc: The parameter specification array to validate.
*/
bool fs_validate_description(const char *name,
const struct fs_parameter_spec *desc)
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 83ac192e7fdd..33b82529cb6e 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1676,7 +1676,7 @@ static const char *fuse_get_link(struct dentry *dentry, struct inode *inode,
goto out_err;
}
- set_delayed_call(callback, page_put_link, &folio->page);
+ set_delayed_call(callback, page_put_link, folio);
return folio_address(folio);
diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c
index 17ce9636a2b1..edcd6f18a8a8 100644
--- a/fs/fuse/readdir.c
+++ b/fs/fuse/readdir.c
@@ -120,7 +120,7 @@ static bool fuse_emit(struct file *file, struct dir_context *ctx,
fuse_add_dirent_to_cache(file, dirent, ctx->pos);
return dir_emit(ctx, dirent->name, dirent->namelen, dirent->ino,
- dirent->type);
+ dirent->type | FILLDIR_FLAG_NOINTR);
}
static int parse_dirfile(char *buf, size_t nbytes, struct file *file,
@@ -419,7 +419,7 @@ static enum fuse_parse_result fuse_parse_cache(struct fuse_file *ff,
if (ff->readdir.pos == ctx->pos) {
res = FOUND_SOME;
if (!dir_emit(ctx, dirent->name, dirent->namelen,
- dirent->ino, dirent->type))
+ dirent->ino, dirent->type | FILLDIR_FLAG_NOINTR))
return FOUND_ALL;
ctx->pos = dirent->off;
}
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index e83d293c3614..7c1014ba7ac7 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -226,28 +226,22 @@ static void gfs2_sb_in(struct gfs2_sbd *sdp, const struct gfs2_sb *str)
static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector, int silent)
{
- struct super_block *sb = sdp->sd_vfs;
- struct page *page;
- struct bio_vec bvec;
- struct bio bio;
+ struct gfs2_sb *sb;
int err;
- page = alloc_page(GFP_KERNEL);
- if (unlikely(!page))
+ sb = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ if (unlikely(!sb))
return -ENOMEM;
-
- bio_init(&bio, sb->s_bdev, &bvec, 1, REQ_OP_READ | REQ_META);
- bio.bi_iter.bi_sector = sector * (sb->s_blocksize >> 9);
- __bio_add_page(&bio, page, PAGE_SIZE, 0);
-
- err = submit_bio_wait(&bio);
+ err = bdev_rw_virt(sdp->sd_vfs->s_bdev,
+ sector * (sdp->sd_vfs->s_blocksize >> 9), sb, PAGE_SIZE,
+ REQ_OP_READ | REQ_META);
if (err) {
pr_warn("error %d reading superblock\n", err);
- __free_page(page);
+ kfree(sb);
return err;
}
- gfs2_sb_in(sdp, page_address(page));
- __free_page(page);
+ gfs2_sb_in(sdp, sb);
+ kfree(sb);
return gfs2_check_sb(sdp, silent);
}
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 44e5658b896c..436cf168a9f5 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -674,7 +674,7 @@ static int gfs2_sync_fs(struct super_block *sb, int wait)
return sdp->sd_log_error;
}
-static int gfs2_do_thaw(struct gfs2_sbd *sdp)
+static int gfs2_do_thaw(struct gfs2_sbd *sdp, enum freeze_holder who, const void *freeze_owner)
{
struct super_block *sb = sdp->sd_vfs;
int error;
@@ -682,7 +682,7 @@ static int gfs2_do_thaw(struct gfs2_sbd *sdp)
error = gfs2_freeze_lock_shared(sdp);
if (error)
goto fail;
- error = thaw_super(sb, FREEZE_HOLDER_USERSPACE);
+ error = thaw_super(sb, who, freeze_owner);
if (!error)
return 0;
@@ -703,14 +703,14 @@ void gfs2_freeze_func(struct work_struct *work)
if (test_bit(SDF_FROZEN, &sdp->sd_flags))
goto freeze_failed;
- error = freeze_super(sb, FREEZE_HOLDER_USERSPACE);
+ error = freeze_super(sb, FREEZE_HOLDER_USERSPACE, NULL);
if (error)
goto freeze_failed;
gfs2_freeze_unlock(sdp);
set_bit(SDF_FROZEN, &sdp->sd_flags);
- error = gfs2_do_thaw(sdp);
+ error = gfs2_do_thaw(sdp, FREEZE_HOLDER_USERSPACE, NULL);
if (error)
goto out;
@@ -728,10 +728,13 @@ out:
/**
* gfs2_freeze_super - prevent further writes to the filesystem
* @sb: the VFS structure for the filesystem
+ * @who: freeze flags
+ * @freeze_owner: owner of the freeze
*
*/
-static int gfs2_freeze_super(struct super_block *sb, enum freeze_holder who)
+static int gfs2_freeze_super(struct super_block *sb, enum freeze_holder who,
+ const void *freeze_owner)
{
struct gfs2_sbd *sdp = sb->s_fs_info;
int error;
@@ -744,7 +747,7 @@ static int gfs2_freeze_super(struct super_block *sb, enum freeze_holder who)
}
for (;;) {
- error = freeze_super(sb, FREEZE_HOLDER_USERSPACE);
+ error = freeze_super(sb, who, freeze_owner);
if (error) {
fs_info(sdp, "GFS2: couldn't freeze filesystem: %d\n",
error);
@@ -758,7 +761,7 @@ static int gfs2_freeze_super(struct super_block *sb, enum freeze_holder who)
break;
}
- error = gfs2_do_thaw(sdp);
+ error = gfs2_do_thaw(sdp, who, freeze_owner);
if (error)
goto out;
@@ -796,10 +799,13 @@ static int gfs2_freeze_fs(struct super_block *sb)
/**
* gfs2_thaw_super - reallow writes to the filesystem
* @sb: the VFS structure for the filesystem
+ * @who: freeze flags
+ * @freeze_owner: owner of the freeze
*
*/
-static int gfs2_thaw_super(struct super_block *sb, enum freeze_holder who)
+static int gfs2_thaw_super(struct super_block *sb, enum freeze_holder who,
+ const void *freeze_owner)
{
struct gfs2_sbd *sdp = sb->s_fs_info;
int error;
@@ -814,7 +820,7 @@ static int gfs2_thaw_super(struct super_block *sb, enum freeze_holder who)
atomic_inc(&sb->s_active);
gfs2_freeze_unlock(sdp);
- error = gfs2_do_thaw(sdp);
+ error = gfs2_do_thaw(sdp, who, freeze_owner);
if (!error) {
clear_bit(SDF_FREEZE_INITIATOR, &sdp->sd_flags);
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index ecc699f8d9fc..748125653d6c 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -174,10 +174,10 @@ static ssize_t freeze_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
switch (n) {
case 0:
- error = thaw_super(sdp->sd_vfs, FREEZE_HOLDER_USERSPACE);
+ error = thaw_super(sdp->sd_vfs, FREEZE_HOLDER_USERSPACE, NULL);
break;
case 1:
- error = freeze_super(sdp->sd_vfs, FREEZE_HOLDER_USERSPACE);
+ error = freeze_super(sdp->sd_vfs, FREEZE_HOLDER_USERSPACE, NULL);
break;
default:
return -EINVAL;
diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c
index 74801911bc1c..30cf4fe78b3d 100644
--- a/fs/hfsplus/wrapper.c
+++ b/fs/hfsplus/wrapper.c
@@ -48,47 +48,19 @@ struct hfsplus_wd {
int hfsplus_submit_bio(struct super_block *sb, sector_t sector,
void *buf, void **data, blk_opf_t opf)
{
- const enum req_op op = opf & REQ_OP_MASK;
- struct bio *bio;
- int ret = 0;
- u64 io_size;
- loff_t start;
- int offset;
+ u64 io_size = hfsplus_min_io_size(sb);
+ loff_t start = (loff_t)sector << HFSPLUS_SECTOR_SHIFT;
+ int offset = start & (io_size - 1);
+
+ if ((opf & REQ_OP_MASK) != REQ_OP_WRITE && data)
+ *data = (u8 *)buf + offset;
/*
- * Align sector to hardware sector size and find offset. We
- * assume that io_size is a power of two, which _should_
- * be true.
+ * Align sector to hardware sector size and find offset. We assume that
+ * io_size is a power of two, which _should_ be true.
*/
- io_size = hfsplus_min_io_size(sb);
- start = (loff_t)sector << HFSPLUS_SECTOR_SHIFT;
- offset = start & (io_size - 1);
sector &= ~((io_size >> HFSPLUS_SECTOR_SHIFT) - 1);
-
- bio = bio_alloc(sb->s_bdev, 1, opf, GFP_NOIO);
- bio->bi_iter.bi_sector = sector;
-
- if (op != REQ_OP_WRITE && data)
- *data = (u8 *)buf + offset;
-
- while (io_size > 0) {
- unsigned int page_offset = offset_in_page(buf);
- unsigned int len = min_t(unsigned int, PAGE_SIZE - page_offset,
- io_size);
-
- ret = bio_add_page(bio, virt_to_page(buf), len, page_offset);
- if (ret != len) {
- ret = -EIO;
- goto out;
- }
- io_size -= len;
- buf = (u8 *)buf + len;
- }
-
- ret = submit_bio_wait(bio);
-out:
- bio_put(bio);
- return ret < 0 ? ret : 0;
+ return bdev_rw_virt(sb->s_bdev, sector, buf, io_size, opf);
}
static int hfsplus_read_mdb(void *bufptr, struct hfsplus_wd *wd)
diff --git a/fs/internal.h b/fs/internal.h
index b9b3e29a73fd..393f6c5c24f6 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -66,6 +66,7 @@ int do_linkat(int olddfd, struct filename *old, int newdfd,
int vfs_tmpfile(struct mnt_idmap *idmap,
const struct path *parentpath,
struct file *file, umode_t mode);
+struct dentry *d_hash_and_lookup(struct dentry *, struct qstr *);
/*
* namespace.c
@@ -343,3 +344,9 @@ static inline bool path_mounted(const struct path *path)
void file_f_owner_release(struct file *file);
bool file_seek_cur_needs_f_lock(struct file *file);
int statmount_mnt_idmap(struct mnt_idmap *idmap, struct seq_file *seq, bool uid_map);
+struct dentry *find_next_child(struct dentry *parent, struct dentry *prev);
+int anon_inode_getattr(struct mnt_idmap *idmap, const struct path *path,
+ struct kstat *stat, u32 request_mask,
+ unsigned int query_flags);
+int anon_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
+ struct iattr *attr);
diff --git a/fs/ioctl.c b/fs/ioctl.c
index c91fd2b46a77..69107a245b4c 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -396,8 +396,8 @@ static int ioctl_fsfreeze(struct file *filp)
/* Freeze */
if (sb->s_op->freeze_super)
- return sb->s_op->freeze_super(sb, FREEZE_HOLDER_USERSPACE);
- return freeze_super(sb, FREEZE_HOLDER_USERSPACE);
+ return sb->s_op->freeze_super(sb, FREEZE_HOLDER_USERSPACE, NULL);
+ return freeze_super(sb, FREEZE_HOLDER_USERSPACE, NULL);
}
static int ioctl_fsthaw(struct file *filp)
@@ -409,8 +409,8 @@ static int ioctl_fsthaw(struct file *filp)
/* Thaw */
if (sb->s_op->thaw_super)
- return sb->s_op->thaw_super(sb, FREEZE_HOLDER_USERSPACE);
- return thaw_super(sb, FREEZE_HOLDER_USERSPACE);
+ return sb->s_op->thaw_super(sb, FREEZE_HOLDER_USERSPACE, NULL);
+ return thaw_super(sb, FREEZE_HOLDER_USERSPACE, NULL);
}
static int ioctl_file_dedupe_range(struct file *file,
@@ -821,7 +821,8 @@ static int do_vfs_ioctl(struct file *filp, unsigned int fd,
return ioctl_fioasync(fd, filp, argp);
case FIOQSIZE:
- if (S_ISDIR(inode->i_mode) || S_ISREG(inode->i_mode) ||
+ if (S_ISDIR(inode->i_mode) ||
+ (S_ISREG(inode->i_mode) && !IS_ANON_FILE(inode)) ||
S_ISLNK(inode->i_mode)) {
loff_t res = inode_get_bytes(inode);
return copy_to_user(argp, &res, sizeof(res)) ?
@@ -856,7 +857,7 @@ static int do_vfs_ioctl(struct file *filp, unsigned int fd,
return ioctl_file_dedupe_range(filp, argp);
case FIONREAD:
- if (!S_ISREG(inode->i_mode))
+ if (!S_ISREG(inode->i_mode) || IS_ANON_FILE(inode))
return vfs_ioctl(filp, cmd, arg);
return put_user(i_size_read(inode) - filp->f_pos,
@@ -881,7 +882,7 @@ static int do_vfs_ioctl(struct file *filp, unsigned int fd,
return ioctl_get_fs_sysfs_path(filp, argp);
default:
- if (S_ISREG(inode->i_mode))
+ if (S_ISREG(inode->i_mode) && !IS_ANON_FILE(inode))
return file_ioctl(filp, cmd, argp);
break;
}
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 5b08bd417b28..233abf598f65 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -679,11 +679,12 @@ static int iomap_read_folio_sync(loff_t block_start, struct folio *folio,
return submit_bio_wait(&bio);
}
-static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
- size_t len, struct folio *folio)
+static int __iomap_write_begin(const struct iomap_iter *iter, size_t len,
+ struct folio *folio)
{
const struct iomap *srcmap = iomap_iter_srcmap(iter);
struct iomap_folio_state *ifs;
+ loff_t pos = iter->pos;
loff_t block_size = i_blocksize(iter->inode);
loff_t block_start = round_down(pos, block_size);
loff_t block_end = round_up(pos + len, block_size);
@@ -741,10 +742,13 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
return 0;
}
-static struct folio *__iomap_get_folio(struct iomap_iter *iter, loff_t pos,
- size_t len)
+static struct folio *__iomap_get_folio(struct iomap_iter *iter, size_t len)
{
const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops;
+ loff_t pos = iter->pos;
+
+ if (!mapping_large_folio_support(iter->inode->i_mapping))
+ len = min_t(size_t, len, PAGE_SIZE - offset_in_page(pos));
if (folio_ops && folio_ops->get_folio)
return folio_ops->get_folio(iter, pos, len);
@@ -752,10 +756,11 @@ static struct folio *__iomap_get_folio(struct iomap_iter *iter, loff_t pos,
return iomap_get_folio(iter, pos, len);
}
-static void __iomap_put_folio(struct iomap_iter *iter, loff_t pos, size_t ret,
+static void __iomap_put_folio(struct iomap_iter *iter, size_t ret,
struct folio *folio)
{
const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops;
+ loff_t pos = iter->pos;
if (folio_ops && folio_ops->put_folio) {
folio_ops->put_folio(iter->inode, pos, ret, folio);
@@ -765,6 +770,22 @@ static void __iomap_put_folio(struct iomap_iter *iter, loff_t pos, size_t ret,
}
}
+/* trim pos and bytes to within a given folio */
+static loff_t iomap_trim_folio_range(struct iomap_iter *iter,
+ struct folio *folio, size_t *offset, u64 *bytes)
+{
+ loff_t pos = iter->pos;
+ size_t fsize = folio_size(folio);
+
+ WARN_ON_ONCE(pos < folio_pos(folio));
+ WARN_ON_ONCE(pos >= folio_pos(folio) + fsize);
+
+ *offset = offset_in_folio(folio, pos);
+ *bytes = min(*bytes, fsize - *offset);
+
+ return pos;
+}
+
static int iomap_write_begin_inline(const struct iomap_iter *iter,
struct folio *folio)
{
@@ -774,14 +795,22 @@ static int iomap_write_begin_inline(const struct iomap_iter *iter,
return iomap_read_inline_data(iter, folio);
}
-static int iomap_write_begin(struct iomap_iter *iter, loff_t pos,
- size_t len, struct folio **foliop)
+/*
+ * Grab and prepare a folio for write based on iter state. Returns the folio,
+ * offset, and length. Callers can optionally pass a max length *plen,
+ * otherwise init to zero.
+ */
+static int iomap_write_begin(struct iomap_iter *iter, struct folio **foliop,
+ size_t *poffset, u64 *plen)
{
const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops;
const struct iomap *srcmap = iomap_iter_srcmap(iter);
+ loff_t pos = iter->pos;
+ u64 len = min_t(u64, SIZE_MAX, iomap_length(iter));
struct folio *folio;
int status = 0;
+ len = min_not_zero(len, *plen);
BUG_ON(pos + len > iter->iomap.offset + iter->iomap.length);
if (srcmap != &iter->iomap)
BUG_ON(pos + len > srcmap->offset + srcmap->length);
@@ -789,10 +818,7 @@ static int iomap_write_begin(struct iomap_iter *iter, loff_t pos,
if (fatal_signal_pending(current))
return -EINTR;
- if (!mapping_large_folio_support(iter->inode->i_mapping))
- len = min_t(size_t, len, PAGE_SIZE - offset_in_page(pos));
-
- folio = __iomap_get_folio(iter, pos, len);
+ folio = __iomap_get_folio(iter, len);
if (IS_ERR(folio))
return PTR_ERR(folio);
@@ -816,24 +842,24 @@ static int iomap_write_begin(struct iomap_iter *iter, loff_t pos,
}
}
- if (pos + len > folio_pos(folio) + folio_size(folio))
- len = folio_pos(folio) + folio_size(folio) - pos;
+ pos = iomap_trim_folio_range(iter, folio, poffset, &len);
if (srcmap->type == IOMAP_INLINE)
status = iomap_write_begin_inline(iter, folio);
else if (srcmap->flags & IOMAP_F_BUFFER_HEAD)
status = __block_write_begin_int(folio, pos, len, NULL, srcmap);
else
- status = __iomap_write_begin(iter, pos, len, folio);
+ status = __iomap_write_begin(iter, len, folio);
if (unlikely(status))
goto out_unlock;
*foliop = folio;
+ *plen = len;
return 0;
out_unlock:
- __iomap_put_folio(iter, pos, 0, folio);
+ __iomap_put_folio(iter, 0, folio);
return status;
}
@@ -883,10 +909,11 @@ static void iomap_write_end_inline(const struct iomap_iter *iter,
* Returns true if all copied bytes have been written to the pagecache,
* otherwise return false.
*/
-static bool iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len,
- size_t copied, struct folio *folio)
+static bool iomap_write_end(struct iomap_iter *iter, size_t len, size_t copied,
+ struct folio *folio)
{
const struct iomap *srcmap = iomap_iter_srcmap(iter);
+ loff_t pos = iter->pos;
if (srcmap->type == IOMAP_INLINE) {
iomap_write_end_inline(iter, folio, pos, copied);
@@ -917,14 +944,14 @@ static int iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
struct folio *folio;
loff_t old_size;
size_t offset; /* Offset into folio */
- size_t bytes; /* Bytes to write to folio */
+ u64 bytes; /* Bytes to write to folio */
size_t copied; /* Bytes copied from user */
u64 written; /* Bytes have been written */
- loff_t pos = iter->pos;
+ loff_t pos;
bytes = iov_iter_count(i);
retry:
- offset = pos & (chunk - 1);
+ offset = iter->pos & (chunk - 1);
bytes = min(chunk - offset, bytes);
status = balance_dirty_pages_ratelimited_flags(mapping,
bdp_flags);
@@ -949,23 +976,21 @@ retry:
break;
}
- status = iomap_write_begin(iter, pos, bytes, &folio);
+ status = iomap_write_begin(iter, &folio, &offset, &bytes);
if (unlikely(status)) {
- iomap_write_failed(iter->inode, pos, bytes);
+ iomap_write_failed(iter->inode, iter->pos, bytes);
break;
}
if (iter->iomap.flags & IOMAP_F_STALE)
break;
- offset = offset_in_folio(folio, pos);
- if (bytes > folio_size(folio) - offset)
- bytes = folio_size(folio) - offset;
+ pos = iter->pos;
if (mapping_writably_mapped(mapping))
flush_dcache_folio(folio);
copied = copy_folio_from_iter_atomic(folio, offset, bytes, i);
- written = iomap_write_end(iter, pos, bytes, copied, folio) ?
+ written = iomap_write_end(iter, bytes, copied, folio) ?
copied : 0;
/*
@@ -980,7 +1005,7 @@ retry:
i_size_write(iter->inode, pos + written);
iter->iomap.flags |= IOMAP_F_SIZE_CHANGED;
}
- __iomap_put_folio(iter, pos, written, folio);
+ __iomap_put_folio(iter, written, folio);
if (old_size < pos)
pagecache_isize_extended(iter->inode, old_size, pos);
@@ -1276,22 +1301,17 @@ static int iomap_unshare_iter(struct iomap_iter *iter)
do {
struct folio *folio;
size_t offset;
- loff_t pos = iter->pos;
bool ret;
bytes = min_t(u64, SIZE_MAX, bytes);
- status = iomap_write_begin(iter, pos, bytes, &folio);
+ status = iomap_write_begin(iter, &folio, &offset, &bytes);
if (unlikely(status))
return status;
if (iomap->flags & IOMAP_F_STALE)
break;
- offset = offset_in_folio(folio, pos);
- if (bytes > folio_size(folio) - offset)
- bytes = folio_size(folio) - offset;
-
- ret = iomap_write_end(iter, pos, bytes, bytes, folio);
- __iomap_put_folio(iter, pos, bytes, folio);
+ ret = iomap_write_end(iter, bytes, bytes, folio);
+ __iomap_put_folio(iter, bytes, folio);
if (WARN_ON_ONCE(!ret))
return -EIO;
@@ -1351,11 +1371,10 @@ static int iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
do {
struct folio *folio;
size_t offset;
- loff_t pos = iter->pos;
bool ret;
bytes = min_t(u64, SIZE_MAX, bytes);
- status = iomap_write_begin(iter, pos, bytes, &folio);
+ status = iomap_write_begin(iter, &folio, &offset, &bytes);
if (status)
return status;
if (iter->iomap.flags & IOMAP_F_STALE)
@@ -1363,15 +1382,12 @@ static int iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
/* warn about zeroing folios beyond eof that won't write back */
WARN_ON_ONCE(folio_pos(folio) > iter->inode->i_size);
- offset = offset_in_folio(folio, pos);
- if (bytes > folio_size(folio) - offset)
- bytes = folio_size(folio) - offset;
folio_zero_range(folio, offset, bytes);
folio_mark_accessed(folio);
- ret = iomap_write_end(iter, pos, bytes, bytes, folio);
- __iomap_put_folio(iter, pos, bytes, folio);
+ ret = iomap_write_end(iter, bytes, bytes, folio);
+ __iomap_put_folio(iter, bytes, folio);
if (WARN_ON_ONCE(!ret))
return -EIO;
diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h
index 9eab2c8ac3c5..455cc6f90be0 100644
--- a/fs/iomap/trace.h
+++ b/fs/iomap/trace.h
@@ -99,7 +99,11 @@ DEFINE_RANGE_EVENT(iomap_dio_rw_queued);
{ IOMAP_FAULT, "FAULT" }, \
{ IOMAP_DIRECT, "DIRECT" }, \
{ IOMAP_NOWAIT, "NOWAIT" }, \
- { IOMAP_ATOMIC, "ATOMIC" }
+ { IOMAP_OVERWRITE_ONLY, "OVERWRITE_ONLY" }, \
+ { IOMAP_UNSHARE, "UNSHARE" }, \
+ { IOMAP_DAX, "DAX" }, \
+ { IOMAP_ATOMIC, "ATOMIC" }, \
+ { IOMAP_DONTCACHE, "DONTCACHE" }
#define IOMAP_F_FLAGS_STRINGS \
{ IOMAP_F_NEW, "NEW" }, \
@@ -107,7 +111,14 @@ DEFINE_RANGE_EVENT(iomap_dio_rw_queued);
{ IOMAP_F_SHARED, "SHARED" }, \
{ IOMAP_F_MERGED, "MERGED" }, \
{ IOMAP_F_BUFFER_HEAD, "BH" }, \
- { IOMAP_F_SIZE_CHANGED, "SIZE_CHANGED" }
+ { IOMAP_F_XATTR, "XATTR" }, \
+ { IOMAP_F_BOUNDARY, "BOUNDARY" }, \
+ { IOMAP_F_ANON_WRITE, "ANON_WRITE" }, \
+ { IOMAP_F_ATOMIC_BIO, "ATOMIC_BIO" }, \
+ { IOMAP_F_PRIVATE, "PRIVATE" }, \
+ { IOMAP_F_SIZE_CHANGED, "SIZE_CHANGED" }, \
+ { IOMAP_F_STALE, "STALE" }
+
#define IOMAP_DIO_STRINGS \
{IOMAP_DIO_FORCE_WAIT, "DIO_FORCE_WAIT" }, \
@@ -138,7 +149,7 @@ DECLARE_EVENT_CLASS(iomap_class,
__entry->bdev = iomap->bdev ? iomap->bdev->bd_dev : 0;
),
TP_printk("dev %d:%d ino 0x%llx bdev %d:%d addr 0x%llx offset 0x%llx "
- "length 0x%llx type %s flags %s",
+ "length 0x%llx type %s (0x%x) flags %s (0x%x)",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
MAJOR(__entry->bdev), MINOR(__entry->bdev),
@@ -146,7 +157,9 @@ DECLARE_EVENT_CLASS(iomap_class,
__entry->offset,
__entry->length,
__print_symbolic(__entry->type, IOMAP_TYPE_STRINGS),
- __print_flags(__entry->flags, "|", IOMAP_F_FLAGS_STRINGS))
+ __entry->type,
+ __print_flags(__entry->flags, "|", IOMAP_F_FLAGS_STRINGS),
+ __entry->flags)
)
#define DEFINE_IOMAP_EVENT(name) \
@@ -185,7 +198,7 @@ TRACE_EVENT(iomap_writepage_map,
__entry->bdev = iomap->bdev ? iomap->bdev->bd_dev : 0;
),
TP_printk("dev %d:%d ino 0x%llx bdev %d:%d pos 0x%llx dirty len 0x%llx "
- "addr 0x%llx offset 0x%llx length 0x%llx type %s flags %s",
+ "addr 0x%llx offset 0x%llx length 0x%llx type %s (0x%x) flags %s (0x%x)",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
MAJOR(__entry->bdev), MINOR(__entry->bdev),
@@ -195,7 +208,9 @@ TRACE_EVENT(iomap_writepage_map,
__entry->offset,
__entry->length,
__print_symbolic(__entry->type, IOMAP_TYPE_STRINGS),
- __print_flags(__entry->flags, "|", IOMAP_F_FLAGS_STRINGS))
+ __entry->type,
+ __print_flags(__entry->flags, "|", IOMAP_F_FLAGS_STRINGS),
+ __entry->flags)
);
TRACE_EVENT(iomap_iter,
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index 5124e196c2bf..c1719b5778a1 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -62,6 +62,21 @@ const struct super_operations kernfs_sops = {
.show_options = kernfs_sop_show_options,
.show_path = kernfs_sop_show_path,
+
+ /*
+ * sysfs is built on top of kernfs and sysfs provides the power
+ * management infrastructure to support suspend/hibernate by
+ * writing to various files in /sys/power/. As filesystems may
+ * be automatically frozen during suspend/hibernate implementing
+ * freeze/thaw support for kernfs generically will cause
+ * deadlocks as the suspending/hibernation initiating task will
+ * hold a VFS lock that it will then wait upon to be released.
+ * If freeze/thaw for kernfs is needed talk to the VFS.
+ */
+ .freeze_fs = NULL,
+ .unfreeze_fs = NULL,
+ .freeze_super = NULL,
+ .thaw_super = NULL,
};
static int kernfs_encode_fh(struct inode *inode, __u32 *fh, int *max_len,
@@ -255,7 +270,7 @@ struct dentry *kernfs_node_dentry(struct kernfs_node *kn,
dput(dentry);
return ERR_PTR(-ENOMEM);
}
- dtmp = lookup_positive_unlocked(name, dentry, strlen(name));
+ dtmp = lookup_noperm_positive_unlocked(&QSTR(name), dentry);
dput(dentry);
kfree(name);
if (IS_ERR(dtmp))
diff --git a/fs/libfs.c b/fs/libfs.c
index 6393d7c49ee6..9ea0ecc325a8 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -583,7 +583,7 @@ const struct file_operations simple_offset_dir_operations = {
.fsync = noop_fsync,
};
-static struct dentry *find_next_child(struct dentry *parent, struct dentry *prev)
+struct dentry *find_next_child(struct dentry *parent, struct dentry *prev)
{
struct dentry *child = NULL, *d;
@@ -603,6 +603,7 @@ static struct dentry *find_next_child(struct dentry *parent, struct dentry *prev
dput(prev);
return child;
}
+EXPORT_SYMBOL(find_next_child);
void simple_recursive_removal(struct dentry *dentry,
void (*callback)(struct dentry *))
@@ -1647,10 +1648,16 @@ struct inode *alloc_anon_inode(struct super_block *s)
* that it already _is_ on the dirty list.
*/
inode->i_state = I_DIRTY;
- inode->i_mode = S_IRUSR | S_IWUSR;
+ /*
+ * Historically anonymous inodes didn't have a type at all and
+ * userspace has come to rely on this. Internally they're just
+ * regular files but S_IFREG is masked off when reporting
+ * information to userspace.
+ */
+ inode->i_mode = S_IFREG | S_IRUSR | S_IWUSR;
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
- inode->i_flags |= S_PRIVATE;
+ inode->i_flags |= S_PRIVATE | S_ANON_INODE;
simple_inode_init_ts(inode);
return inode;
}
diff --git a/fs/mpage.c b/fs/mpage.c
index ad7844de87c3..c5fd821fd30e 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -445,10 +445,9 @@ static void clean_buffers(struct folio *folio, unsigned first_unmapped)
try_to_free_buffers(folio);
}
-static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc,
- void *data)
+static int mpage_write_folio(struct writeback_control *wbc, struct folio *folio,
+ struct mpage_data *mpd)
{
- struct mpage_data *mpd = data;
struct bio *bio = mpd->bio;
struct address_space *mapping = folio->mapping;
struct inode *inode = mapping->host;
@@ -656,14 +655,16 @@ mpage_writepages(struct address_space *mapping,
struct mpage_data mpd = {
.get_block = get_block,
};
+ struct folio *folio = NULL;
struct blk_plug plug;
- int ret;
+ int error;
blk_start_plug(&plug);
- ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd);
+ while ((folio = writeback_iter(mapping, wbc, folio, &error)))
+ error = mpage_write_folio(wbc, folio, &mpd);
if (mpd.bio)
mpage_bio_submit_write(mpd.bio);
blk_finish_plug(&plug);
- return ret;
+ return error;
}
EXPORT_SYMBOL(mpage_writepages);
diff --git a/fs/namei.c b/fs/namei.c
index 84a0e0b0111c..4bb889fc980b 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -571,14 +571,14 @@ int inode_permission(struct mnt_idmap *idmap,
int retval;
retval = sb_permission(inode->i_sb, inode, mask);
- if (retval)
+ if (unlikely(retval))
return retval;
if (unlikely(mask & MAY_WRITE)) {
/*
* Nobody gets write access to an immutable file.
*/
- if (IS_IMMUTABLE(inode))
+ if (unlikely(IS_IMMUTABLE(inode)))
return -EPERM;
/*
@@ -586,16 +586,16 @@ int inode_permission(struct mnt_idmap *idmap,
* written back improperly if their true value is unknown
* to the vfs.
*/
- if (HAS_UNMAPPED_ID(idmap, inode))
+ if (unlikely(HAS_UNMAPPED_ID(idmap, inode)))
return -EACCES;
}
retval = do_inode_permission(idmap, inode, mask);
- if (retval)
+ if (unlikely(retval))
return retval;
retval = devcgroup_inode_permission(inode, mask);
- if (retval)
+ if (unlikely(retval))
return retval;
return security_inode_permission(inode, mask);
@@ -1915,13 +1915,13 @@ static const char *pick_link(struct nameidata *nd, struct path *link,
unlikely(link->mnt->mnt_flags & MNT_NOSYMFOLLOW))
return ERR_PTR(-ELOOP);
- if (!(nd->flags & LOOKUP_RCU)) {
+ if (unlikely(atime_needs_update(&last->link, inode))) {
+ if (nd->flags & LOOKUP_RCU) {
+ if (!try_to_unlazy(nd))
+ return ERR_PTR(-ECHILD);
+ }
touch_atime(&last->link);
cond_resched();
- } else if (atime_needs_update(&last->link, inode)) {
- if (!try_to_unlazy(nd))
- return ERR_PTR(-ECHILD);
- touch_atime(&last->link);
}
error = security_inode_follow_link(link->dentry, inode,
@@ -2434,9 +2434,12 @@ static int link_path_walk(const char *name, struct nameidata *nd)
nd->flags |= LOOKUP_PARENT;
if (IS_ERR(name))
return PTR_ERR(name);
- while (*name=='/')
- name++;
- if (!*name) {
+ if (*name == '/') {
+ do {
+ name++;
+ } while (unlikely(*name == '/'));
+ }
+ if (unlikely(!*name)) {
nd->dir_mode = 0; // short-circuit the 'hardening' idiocy
return 0;
}
@@ -2449,7 +2452,7 @@ static int link_path_walk(const char *name, struct nameidata *nd)
idmap = mnt_idmap(nd->path.mnt);
err = may_lookup(idmap, nd);
- if (err)
+ if (unlikely(err))
return err;
nd->last.name = name;
@@ -2869,13 +2872,12 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
}
EXPORT_SYMBOL(vfs_path_lookup);
-static int lookup_one_common(struct mnt_idmap *idmap,
- const char *name, struct dentry *base, int len,
- struct qstr *this)
+static int lookup_noperm_common(struct qstr *qname, struct dentry *base)
{
- this->name = name;
- this->len = len;
- this->hash = full_name_hash(base, name, len);
+ const char *name = qname->name;
+ u32 len = qname->len;
+
+ qname->hash = full_name_hash(base, name, len);
if (!len)
return -EACCES;
@@ -2892,139 +2894,135 @@ static int lookup_one_common(struct mnt_idmap *idmap,
* to use its own hash..
*/
if (base->d_flags & DCACHE_OP_HASH) {
- int err = base->d_op->d_hash(base, this);
+ int err = base->d_op->d_hash(base, qname);
if (err < 0)
return err;
}
+ return 0;
+}
+static int lookup_one_common(struct mnt_idmap *idmap,
+ struct qstr *qname, struct dentry *base)
+{
+ int err;
+ err = lookup_noperm_common(qname, base);
+ if (err < 0)
+ return err;
return inode_permission(idmap, base->d_inode, MAY_EXEC);
}
/**
- * try_lookup_one_len - filesystem helper to lookup single pathname component
- * @name: pathname component to lookup
+ * try_lookup_noperm - filesystem helper to lookup single pathname component
+ * @name: qstr storing pathname component to lookup
* @base: base directory to lookup from
- * @len: maximum length @len should be interpreted to
*
* Look up a dentry by name in the dcache, returning NULL if it does not
* currently exist. The function does not try to create a dentry.
*
* Note that this routine is purely a helper for filesystem usage and should
- * not be called by generic code.
+ * not be called by generic code. It does no permission checking.
*
* No locks need be held - only a counted reference to @base is needed.
*
*/
-struct dentry *try_lookup_one_len(const char *name, struct dentry *base, int len)
+struct dentry *try_lookup_noperm(struct qstr *name, struct dentry *base)
{
- struct qstr this;
int err;
- err = lookup_one_common(&nop_mnt_idmap, name, base, len, &this);
+ err = lookup_noperm_common(name, base);
if (err)
return ERR_PTR(err);
- return lookup_dcache(&this, base, 0);
+ return lookup_dcache(name, base, 0);
}
-EXPORT_SYMBOL(try_lookup_one_len);
+EXPORT_SYMBOL(try_lookup_noperm);
/**
- * lookup_one_len - filesystem helper to lookup single pathname component
- * @name: pathname component to lookup
+ * lookup_noperm - filesystem helper to lookup single pathname component
+ * @name: qstr storing pathname component to lookup
* @base: base directory to lookup from
- * @len: maximum length @len should be interpreted to
*
* Note that this routine is purely a helper for filesystem usage and should
- * not be called by generic code.
+ * not be called by generic code. It does no permission checking.
*
* The caller must hold base->i_mutex.
*/
-struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
+struct dentry *lookup_noperm(struct qstr *name, struct dentry *base)
{
struct dentry *dentry;
- struct qstr this;
int err;
WARN_ON_ONCE(!inode_is_locked(base->d_inode));
- err = lookup_one_common(&nop_mnt_idmap, name, base, len, &this);
+ err = lookup_noperm_common(name, base);
if (err)
return ERR_PTR(err);
- dentry = lookup_dcache(&this, base, 0);
- return dentry ? dentry : __lookup_slow(&this, base, 0);
+ dentry = lookup_dcache(name, base, 0);
+ return dentry ? dentry : __lookup_slow(name, base, 0);
}
-EXPORT_SYMBOL(lookup_one_len);
+EXPORT_SYMBOL(lookup_noperm);
/**
- * lookup_one - filesystem helper to lookup single pathname component
+ * lookup_one - lookup single pathname component
* @idmap: idmap of the mount the lookup is performed from
- * @name: pathname component to lookup
+ * @name: qstr holding pathname component to lookup
* @base: base directory to lookup from
- * @len: maximum length @len should be interpreted to
*
- * Note that this routine is purely a helper for filesystem usage and should
- * not be called by generic code.
+ * This can be used for in-kernel filesystem clients such as file servers.
*
* The caller must hold base->i_mutex.
*/
-struct dentry *lookup_one(struct mnt_idmap *idmap, const char *name,
- struct dentry *base, int len)
+struct dentry *lookup_one(struct mnt_idmap *idmap, struct qstr *name,
+ struct dentry *base)
{
struct dentry *dentry;
- struct qstr this;
int err;
WARN_ON_ONCE(!inode_is_locked(base->d_inode));
- err = lookup_one_common(idmap, name, base, len, &this);
+ err = lookup_one_common(idmap, name, base);
if (err)
return ERR_PTR(err);
- dentry = lookup_dcache(&this, base, 0);
- return dentry ? dentry : __lookup_slow(&this, base, 0);
+ dentry = lookup_dcache(name, base, 0);
+ return dentry ? dentry : __lookup_slow(name, base, 0);
}
EXPORT_SYMBOL(lookup_one);
/**
- * lookup_one_unlocked - filesystem helper to lookup single pathname component
+ * lookup_one_unlocked - lookup single pathname component
* @idmap: idmap of the mount the lookup is performed from
- * @name: pathname component to lookup
+ * @name: qstr olding pathname component to lookup
* @base: base directory to lookup from
- * @len: maximum length @len should be interpreted to
*
- * Note that this routine is purely a helper for filesystem usage and should
- * not be called by generic code.
+ * This can be used for in-kernel filesystem clients such as file servers.
*
- * Unlike lookup_one_len, it should be called without the parent
- * i_mutex held, and will take the i_mutex itself if necessary.
+ * Unlike lookup_one, it should be called without the parent
+ * i_rwsem held, and will take the i_rwsem itself if necessary.
*/
-struct dentry *lookup_one_unlocked(struct mnt_idmap *idmap,
- const char *name, struct dentry *base,
- int len)
+struct dentry *lookup_one_unlocked(struct mnt_idmap *idmap, struct qstr *name,
+ struct dentry *base)
{
- struct qstr this;
int err;
struct dentry *ret;
- err = lookup_one_common(idmap, name, base, len, &this);
+ err = lookup_one_common(idmap, name, base);
if (err)
return ERR_PTR(err);
- ret = lookup_dcache(&this, base, 0);
+ ret = lookup_dcache(name, base, 0);
if (!ret)
- ret = lookup_slow(&this, base, 0);
+ ret = lookup_slow(name, base, 0);
return ret;
}
EXPORT_SYMBOL(lookup_one_unlocked);
/**
- * lookup_one_positive_unlocked - filesystem helper to lookup single
- * pathname component
+ * lookup_one_positive_unlocked - lookup single pathname component
* @idmap: idmap of the mount the lookup is performed from
- * @name: pathname component to lookup
+ * @name: qstr holding pathname component to lookup
* @base: base directory to lookup from
- * @len: maximum length @len should be interpreted to
*
* This helper will yield ERR_PTR(-ENOENT) on negatives. The helper returns
* known positive or ERR_PTR(). This is what most of the users want.
@@ -3033,16 +3031,15 @@ EXPORT_SYMBOL(lookup_one_unlocked);
* time, so callers of lookup_one_unlocked() need to be very careful; pinned
* positives have >d_inode stable, so this one avoids such problems.
*
- * Note that this routine is purely a helper for filesystem usage and should
- * not be called by generic code.
+ * This can be used for in-kernel filesystem clients such as file servers.
*
- * The helper should be called without i_mutex held.
+ * The helper should be called without i_rwsem held.
*/
struct dentry *lookup_one_positive_unlocked(struct mnt_idmap *idmap,
- const char *name,
- struct dentry *base, int len)
+ struct qstr *name,
+ struct dentry *base)
{
- struct dentry *ret = lookup_one_unlocked(idmap, name, base, len);
+ struct dentry *ret = lookup_one_unlocked(idmap, name, base);
if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) {
dput(ret);
@@ -3053,38 +3050,48 @@ struct dentry *lookup_one_positive_unlocked(struct mnt_idmap *idmap,
EXPORT_SYMBOL(lookup_one_positive_unlocked);
/**
- * lookup_one_len_unlocked - filesystem helper to lookup single pathname component
+ * lookup_noperm_unlocked - filesystem helper to lookup single pathname component
* @name: pathname component to lookup
* @base: base directory to lookup from
- * @len: maximum length @len should be interpreted to
*
* Note that this routine is purely a helper for filesystem usage and should
- * not be called by generic code.
+ * not be called by generic code. It does no permission checking.
*
- * Unlike lookup_one_len, it should be called without the parent
- * i_mutex held, and will take the i_mutex itself if necessary.
+ * Unlike lookup_noperm, it should be called without the parent
+ * i_rwsem held, and will take the i_rwsem itself if necessary.
*/
-struct dentry *lookup_one_len_unlocked(const char *name,
- struct dentry *base, int len)
+struct dentry *lookup_noperm_unlocked(struct qstr *name, struct dentry *base)
{
- return lookup_one_unlocked(&nop_mnt_idmap, name, base, len);
+ struct dentry *ret;
+
+ ret = try_lookup_noperm(name, base);
+ if (!ret)
+ ret = lookup_slow(name, base, 0);
+ return ret;
}
-EXPORT_SYMBOL(lookup_one_len_unlocked);
+EXPORT_SYMBOL(lookup_noperm_unlocked);
/*
- * Like lookup_one_len_unlocked(), except that it yields ERR_PTR(-ENOENT)
+ * Like lookup_noperm_unlocked(), except that it yields ERR_PTR(-ENOENT)
* on negatives. Returns known positive or ERR_PTR(); that's what
* most of the users want. Note that pinned negative with unlocked parent
- * _can_ become positive at any time, so callers of lookup_one_len_unlocked()
+ * _can_ become positive at any time, so callers of lookup_noperm_unlocked()
* need to be very careful; pinned positives have ->d_inode stable, so
* this one avoids such problems.
*/
-struct dentry *lookup_positive_unlocked(const char *name,
- struct dentry *base, int len)
+struct dentry *lookup_noperm_positive_unlocked(struct qstr *name,
+ struct dentry *base)
{
- return lookup_one_positive_unlocked(&nop_mnt_idmap, name, base, len);
+ struct dentry *ret;
+
+ ret = lookup_noperm_unlocked(name, base);
+ if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) {
+ dput(ret);
+ ret = ERR_PTR(-ENOENT);
+ }
+ return ret;
}
-EXPORT_SYMBOL(lookup_positive_unlocked);
+EXPORT_SYMBOL(lookup_noperm_positive_unlocked);
#ifdef CONFIG_UNIX98_PTYS
int path_pts(struct path *path)
@@ -5403,25 +5410,25 @@ EXPORT_SYMBOL(vfs_get_link);
static char *__page_get_link(struct dentry *dentry, struct inode *inode,
struct delayed_call *callback)
{
- struct page *page;
+ struct folio *folio;
struct address_space *mapping = inode->i_mapping;
if (!dentry) {
- page = find_get_page(mapping, 0);
- if (!page)
+ folio = filemap_get_folio(mapping, 0);
+ if (IS_ERR(folio))
return ERR_PTR(-ECHILD);
- if (!PageUptodate(page)) {
- put_page(page);
+ if (!folio_test_uptodate(folio)) {
+ folio_put(folio);
return ERR_PTR(-ECHILD);
}
} else {
- page = read_mapping_page(mapping, 0, NULL);
- if (IS_ERR(page))
- return (char*)page;
+ folio = read_mapping_folio(mapping, 0, NULL);
+ if (IS_ERR(folio))
+ return ERR_CAST(folio);
}
- set_delayed_call(callback, page_put_link, page);
+ set_delayed_call(callback, page_put_link, folio);
BUG_ON(mapping_gfp_mask(mapping) & __GFP_HIGHMEM);
- return page_address(page);
+ return folio_address(folio);
}
const char *page_get_link_raw(struct dentry *dentry, struct inode *inode,
@@ -5431,6 +5438,17 @@ const char *page_get_link_raw(struct dentry *dentry, struct inode *inode,
}
EXPORT_SYMBOL_GPL(page_get_link_raw);
+/**
+ * page_get_link() - An implementation of the get_link inode_operation.
+ * @dentry: The directory entry which is the symlink.
+ * @inode: The inode for the symlink.
+ * @callback: Used to drop the reference to the symlink.
+ *
+ * Filesystems which store their symlinks in the page cache should use
+ * this to implement the get_link() member of their inode_operations.
+ *
+ * Return: A pointer to the NUL-terminated symlink.
+ */
const char *page_get_link(struct dentry *dentry, struct inode *inode,
struct delayed_call *callback)
{
@@ -5440,12 +5458,25 @@ const char *page_get_link(struct dentry *dentry, struct inode *inode,
nd_terminate_link(kaddr, inode->i_size, PAGE_SIZE - 1);
return kaddr;
}
-
EXPORT_SYMBOL(page_get_link);
+/**
+ * page_put_link() - Drop the reference to the symlink.
+ * @arg: The folio which contains the symlink.
+ *
+ * This is used internally by page_get_link(). It is exported for use
+ * by filesystems which need to implement a variant of page_get_link()
+ * themselves. Despite the apparent symmetry, filesystems which use
+ * page_get_link() do not need to call page_put_link().
+ *
+ * The argument, while it has a void pointer type, must be a pointer to
+ * the folio which was retrieved from the page cache. The delayed_call
+ * infrastructure is used to drop the reference count once the caller
+ * is done with the symlink.
+ */
void page_put_link(void *arg)
{
- put_page(arg);
+ folio_put(arg);
}
EXPORT_SYMBOL(page_put_link);
diff --git a/fs/namespace.c b/fs/namespace.c
index 1b466c54a357..552ad7f4d18b 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -355,12 +355,13 @@ static struct mount *alloc_vfsmnt(const char *name)
if (err)
goto out_free_cache;
- if (name) {
+ if (name)
mnt->mnt_devname = kstrdup_const(name,
GFP_KERNEL_ACCOUNT);
- if (!mnt->mnt_devname)
- goto out_free_id;
- }
+ else
+ mnt->mnt_devname = "none";
+ if (!mnt->mnt_devname)
+ goto out_free_id;
#ifdef CONFIG_SMP
mnt->mnt_pcp = alloc_percpu(struct mnt_pcp);
@@ -1264,7 +1265,7 @@ struct vfsmount *vfs_create_mount(struct fs_context *fc)
if (!fc->root)
return ERR_PTR(-EINVAL);
- mnt = alloc_vfsmnt(fc->source ?: "none");
+ mnt = alloc_vfsmnt(fc->source);
if (!mnt)
return ERR_PTR(-ENOMEM);
@@ -5491,7 +5492,7 @@ static int statmount_sb_source(struct kstatmount *s, struct seq_file *seq)
seq->buf[seq->count] = '\0';
seq->count = start;
seq_commit(seq, string_unescape_inplace(seq->buf + start, UNESCAPE_OCTAL));
- } else if (r->mnt_devname) {
+ } else {
seq_puts(seq, r->mnt_devname);
}
return 0;
@@ -5804,7 +5805,9 @@ static int grab_requested_root(struct mnt_namespace *ns, struct path *root)
STATMOUNT_SB_SOURCE | \
STATMOUNT_OPT_ARRAY | \
STATMOUNT_OPT_SEC_ARRAY | \
- STATMOUNT_SUPPORTED_MASK)
+ STATMOUNT_SUPPORTED_MASK | \
+ STATMOUNT_MNT_UIDMAP | \
+ STATMOUNT_MNT_GIDMAP)
static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id,
struct mnt_namespace *ns)
@@ -5839,13 +5842,29 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id,
return err;
s->root = root;
- s->idmap = mnt_idmap(s->mnt);
- if (s->mask & STATMOUNT_SB_BASIC)
- statmount_sb_basic(s);
+ /*
+ * Note that mount properties in mnt->mnt_flags, mnt->mnt_idmap
+ * can change concurrently as we only hold the read-side of the
+ * namespace semaphore and mount properties may change with only
+ * the mount lock held.
+ *
+ * We could sample the mount lock sequence counter to detect
+ * those changes and retry. But it's not worth it. Worst that
+ * happens is that the mnt->mnt_idmap pointer is already changed
+ * while mnt->mnt_flags isn't or vica versa. So what.
+ *
+ * Both mnt->mnt_flags and mnt->mnt_idmap are set and retrieved
+ * via READ_ONCE()/WRITE_ONCE() and guard against theoretical
+ * torn read/write. That's all we care about right now.
+ */
+ s->idmap = mnt_idmap(s->mnt);
if (s->mask & STATMOUNT_MNT_BASIC)
statmount_mnt_basic(s);
+ if (s->mask & STATMOUNT_SB_BASIC)
+ statmount_sb_basic(s);
+
if (s->mask & STATMOUNT_PROPAGATE_FROM)
statmount_propagate_from(s);
@@ -6157,6 +6176,10 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req,
!ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
return -ENOENT;
+ /*
+ * We only need to guard against mount topology changes as
+ * listmount() doesn't care about any mount properties.
+ */
scoped_guard(rwsem_read, &namespace_sem)
ret = do_listmount(ns, kreq.mnt_id, last_mnt_id, kmnt_ids,
nr_mnt_ids, (flags & LISTMOUNT_REVERSE));
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 02c916a55020..6d63b958c4bb 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -1105,6 +1105,8 @@ struct nfs_server *nfs_create_server(struct fs_context *fc)
if (server->namelen == 0 || server->namelen > NFS2_MAXNAMLEN)
server->namelen = NFS2_MAXNAMLEN;
}
+ /* Linux 'subtree_check' borkenness mandates this setting */
+ server->fh_expire_type = NFS_FH_VOL_RENAME;
if (!(fattr->valid & NFS_ATTR_FATTR)) {
error = ctx->nfs_mod->rpc_ops->getattr(server, ctx->mntfh,
@@ -1200,6 +1202,10 @@ void nfs_clients_init(struct net *net)
#if IS_ENABLED(CONFIG_NFS_V4)
idr_init(&nn->cb_ident_idr);
#endif
+#if IS_ENABLED(CONFIG_NFS_V4_1)
+ INIT_LIST_HEAD(&nn->nfs4_data_server_cache);
+ spin_lock_init(&nn->nfs4_data_server_lock);
+#endif
spin_lock_init(&nn->nfs_client_lock);
nn->boot_time = ktime_get_real();
memset(&nn->rpcstats, 0, sizeof(nn->rpcstats));
@@ -1216,6 +1222,9 @@ void nfs_clients_exit(struct net *net)
nfs_cleanup_cb_ident_idr(net);
WARN_ON_ONCE(!list_empty(&nn->nfs_client_list));
WARN_ON_ONCE(!list_empty(&nn->nfs_volume_list));
+#if IS_ENABLED(CONFIG_NFS_V4_1)
+ WARN_ON_ONCE(!list_empty(&nn->nfs4_data_server_cache));
+#endif
}
#ifdef CONFIG_PROC_FS
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index bd23fc736b39..d0e0b435a843 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -2676,6 +2676,18 @@ nfs_unblock_rename(struct rpc_task *task, struct nfs_renamedata *data)
unblock_revalidate(new_dentry);
}
+static bool nfs_rename_is_unsafe_cross_dir(struct dentry *old_dentry,
+ struct dentry *new_dentry)
+{
+ struct nfs_server *server = NFS_SB(old_dentry->d_sb);
+
+ if (old_dentry->d_parent != new_dentry->d_parent)
+ return false;
+ if (server->fh_expire_type & NFS_FH_RENAME_UNSAFE)
+ return !(server->fh_expire_type & NFS_FH_NOEXPIRE_WITH_OPEN);
+ return true;
+}
+
/*
* RENAME
* FIXME: Some nfsds, like the Linux user space nfsd, may generate a
@@ -2763,7 +2775,8 @@ int nfs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
}
- if (S_ISREG(old_inode->i_mode))
+ if (S_ISREG(old_inode->i_mode) &&
+ nfs_rename_is_unsafe_cross_dir(old_dentry, new_dentry))
nfs_sync_inode(old_inode);
task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry,
must_unblock ? nfs_unblock_rename : NULL);
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index f32f8d7c9122..48d89716193a 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -757,7 +757,6 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
{
struct nfs_direct_req *dreq = hdr->dreq;
struct nfs_commit_info cinfo;
- struct nfs_page *req = nfs_list_entry(hdr->pages.next);
struct inode *inode = dreq->inode;
int flags = NFS_ODIRECT_DONE;
@@ -786,6 +785,7 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
spin_unlock(&inode->i_lock);
while (!list_empty(&hdr->pages)) {
+ struct nfs_page *req;
req = nfs_list_entry(hdr->pages.next);
nfs_list_remove_request(req);
diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c
index 4fa304fa5bc4..29d9234d5c08 100644
--- a/fs/nfs/filelayout/filelayoutdev.c
+++ b/fs/nfs/filelayout/filelayoutdev.c
@@ -76,6 +76,7 @@ nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
struct page *scratch;
struct list_head dsaddrs;
struct nfs4_pnfs_ds_addr *da;
+ struct net *net = server->nfs_client->cl_net;
/* set up xdr stream */
scratch = alloc_page(gfp_flags);
@@ -159,8 +160,7 @@ nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
mp_count = be32_to_cpup(p); /* multipath count */
for (j = 0; j < mp_count; j++) {
- da = nfs4_decode_mp_ds_addr(server->nfs_client->cl_net,
- &stream, gfp_flags);
+ da = nfs4_decode_mp_ds_addr(net, &stream, gfp_flags);
if (da)
list_add_tail(&da->da_node, &dsaddrs);
}
@@ -170,7 +170,7 @@ nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
goto out_err_free_deviceid;
}
- dsaddr->ds_list[i] = nfs4_pnfs_ds_add(&dsaddrs, gfp_flags);
+ dsaddr->ds_list[i] = nfs4_pnfs_ds_add(net, &dsaddrs, gfp_flags);
if (!dsaddr->ds_list[i])
goto out_err_drain_dsaddrs;
trace_fl_getdevinfo(server, &pdev->dev_id, dsaddr->ds_list[i]->ds_remotestr);
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 61ad269c825f..e6909cafab68 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -1329,7 +1329,7 @@ static int ff_layout_read_done_cb(struct rpc_task *task,
hdr->args.offset, hdr->args.count,
&hdr->res.op_status, OP_READ,
task->tk_status);
- trace_ff_layout_read_error(hdr);
+ trace_ff_layout_read_error(hdr, task->tk_status);
}
err = ff_layout_async_handle_error(task, hdr->args.context->state,
@@ -1502,7 +1502,7 @@ static int ff_layout_write_done_cb(struct rpc_task *task,
hdr->args.offset, hdr->args.count,
&hdr->res.op_status, OP_WRITE,
task->tk_status);
- trace_ff_layout_write_error(hdr);
+ trace_ff_layout_write_error(hdr, task->tk_status);
}
err = ff_layout_async_handle_error(task, hdr->args.context->state,
@@ -1551,7 +1551,7 @@ static int ff_layout_commit_done_cb(struct rpc_task *task,
data->args.offset, data->args.count,
&data->res.op_status, OP_COMMIT,
task->tk_status);
- trace_ff_layout_commit_error(data);
+ trace_ff_layout_commit_error(data, task->tk_status);
}
err = ff_layout_async_handle_error(task, NULL, data->ds_clp,
diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
index e58bedfb1dcc..4a304cf17c4b 100644
--- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c
+++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
@@ -49,6 +49,7 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
struct nfs4_pnfs_ds_addr *da;
struct nfs4_ff_layout_ds *new_ds = NULL;
struct nfs4_ff_ds_version *ds_versions = NULL;
+ struct net *net = server->nfs_client->cl_net;
u32 mp_count;
u32 version_count;
__be32 *p;
@@ -80,8 +81,7 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
for (i = 0; i < mp_count; i++) {
/* multipath ds */
- da = nfs4_decode_mp_ds_addr(server->nfs_client->cl_net,
- &stream, gfp_flags);
+ da = nfs4_decode_mp_ds_addr(net, &stream, gfp_flags);
if (da)
list_add_tail(&da->da_node, &dsaddrs);
}
@@ -149,7 +149,7 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
new_ds->ds_versions = ds_versions;
new_ds->ds_versions_cnt = version_count;
- new_ds->ds = nfs4_pnfs_ds_add(&dsaddrs, gfp_flags);
+ new_ds->ds = nfs4_pnfs_ds_add(net, &dsaddrs, gfp_flags);
if (!new_ds->ds)
goto out_err_drain_dsaddrs;
diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c
index 5c21caeae075..4ec952f9f47d 100644
--- a/fs/nfs/localio.c
+++ b/fs/nfs/localio.c
@@ -278,6 +278,7 @@ nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred,
new = __nfs_local_open_fh(clp, cred, fh, nfl, mode);
if (IS_ERR(new))
return NULL;
+ rcu_read_lock();
/* try to swap in the pointer */
spin_lock(&clp->cl_uuid.lock);
nf = rcu_dereference_protected(*pnf, 1);
@@ -287,7 +288,6 @@ nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred,
rcu_assign_pointer(*pnf, nf);
}
spin_unlock(&clp->cl_uuid.lock);
- rcu_read_lock();
}
nf = nfs_local_file_get(nf);
rcu_read_unlock();
diff --git a/fs/nfs/netns.h b/fs/nfs/netns.h
index a68b21603ea9..6ba3ea39e928 100644
--- a/fs/nfs/netns.h
+++ b/fs/nfs/netns.h
@@ -31,7 +31,11 @@ struct nfs_net {
unsigned short nfs_callback_tcpport;
unsigned short nfs_callback_tcpport6;
int cb_users[NFS4_MAX_MINOR_VERSION + 1];
-#endif
+#endif /* CONFIG_NFS_V4 */
+#if IS_ENABLED(CONFIG_NFS_V4_1)
+ struct list_head nfs4_data_server_cache;
+ spinlock_t nfs4_data_server_lock;
+#endif /* CONFIG_NFS_V4_1 */
struct nfs_netns_client *nfs_client;
spinlock_t nfs_client_lock;
ktime_t boot_time;
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index 18d8f6529f61..a126eb31f62f 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -104,7 +104,7 @@ struct posix_acl *nfs3_get_acl(struct inode *inode, int type, bool rcu)
switch (status) {
case 0:
- status = nfs_refresh_inode(inode, res.fattr);
+ nfs_refresh_inode(inode, res.fattr);
break;
case -EPFNOSUPPORT:
case -EPROTONOSUPPORT:
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 970f28dbf253..b1d2122bd5a7 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -671,6 +671,15 @@ nfs4_async_handle_exception(struct rpc_task *task, struct nfs_server *server,
struct nfs_client *clp = server->nfs_client;
int ret;
+ if ((task->tk_rpc_status == -ENETDOWN ||
+ task->tk_rpc_status == -ENETUNREACH) &&
+ task->tk_flags & RPC_TASK_NETUNREACH_FATAL) {
+ exception->delay = 0;
+ exception->recovering = 0;
+ exception->retry = 0;
+ return -EIO;
+ }
+
ret = nfs4_do_handle_exception(server, errorcode, exception);
if (exception->delay) {
int ret2 = nfs4_exception_should_retrans(server, exception);
@@ -7074,10 +7083,18 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl,
struct nfs4_unlockdata *p;
struct nfs4_state *state = lsp->ls_state;
struct inode *inode = state->inode;
+ struct nfs_lock_context *l_ctx;
p = kzalloc(sizeof(*p), GFP_KERNEL);
if (p == NULL)
return NULL;
+ l_ctx = nfs_get_lock_context(ctx);
+ if (!IS_ERR(l_ctx)) {
+ p->l_ctx = l_ctx;
+ } else {
+ kfree(p);
+ return NULL;
+ }
p->arg.fh = NFS_FH(inode);
p->arg.fl = &p->fl;
p->arg.seqid = seqid;
@@ -7085,7 +7102,6 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl,
p->lsp = lsp;
/* Ensure we don't close file until we're done freeing locks! */
p->ctx = get_nfs_open_context(ctx);
- p->l_ctx = nfs_get_lock_context(ctx);
locks_init_lock(&p->fl);
locks_copy_lock(&p->fl, fl);
p->server = NFS_SERVER(inode);
diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h
index bc67fe6801b1..deab4c0e21a0 100644
--- a/fs/nfs/nfs4trace.h
+++ b/fs/nfs/nfs4trace.h
@@ -2051,13 +2051,15 @@ TRACE_EVENT(fl_getdevinfo,
DECLARE_EVENT_CLASS(nfs4_flexfiles_io_event,
TP_PROTO(
- const struct nfs_pgio_header *hdr
+ const struct nfs_pgio_header *hdr,
+ int error
),
- TP_ARGS(hdr),
+ TP_ARGS(hdr, error),
TP_STRUCT__entry(
__field(unsigned long, error)
+ __field(unsigned long, nfs_error)
__field(dev_t, dev)
__field(u32, fhandle)
__field(u64, fileid)
@@ -2073,7 +2075,8 @@ DECLARE_EVENT_CLASS(nfs4_flexfiles_io_event,
TP_fast_assign(
const struct inode *inode = hdr->inode;
- __entry->error = hdr->res.op_status;
+ __entry->error = -error;
+ __entry->nfs_error = hdr->res.op_status;
__entry->fhandle = nfs_fhandle_hash(hdr->args.fh);
__entry->fileid = NFS_FILEID(inode);
__entry->dev = inode->i_sb->s_dev;
@@ -2088,7 +2091,8 @@ DECLARE_EVENT_CLASS(nfs4_flexfiles_io_event,
TP_printk(
"error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
- "offset=%llu count=%u stateid=%d:0x%08x dstaddr=%s",
+ "offset=%llu count=%u stateid=%d:0x%08x dstaddr=%s "
+ "nfs_error=%lu (%s)",
-__entry->error,
show_nfs4_status(__entry->error),
MAJOR(__entry->dev), MINOR(__entry->dev),
@@ -2096,28 +2100,32 @@ DECLARE_EVENT_CLASS(nfs4_flexfiles_io_event,
__entry->fhandle,
__entry->offset, __entry->count,
__entry->stateid_seq, __entry->stateid_hash,
- __get_str(dstaddr)
+ __get_str(dstaddr), __entry->nfs_error,
+ show_nfs4_status(__entry->nfs_error)
)
);
#define DEFINE_NFS4_FLEXFILES_IO_EVENT(name) \
DEFINE_EVENT(nfs4_flexfiles_io_event, name, \
TP_PROTO( \
- const struct nfs_pgio_header *hdr \
+ const struct nfs_pgio_header *hdr, \
+ int error \
), \
- TP_ARGS(hdr))
+ TP_ARGS(hdr, error))
DEFINE_NFS4_FLEXFILES_IO_EVENT(ff_layout_read_error);
DEFINE_NFS4_FLEXFILES_IO_EVENT(ff_layout_write_error);
TRACE_EVENT(ff_layout_commit_error,
TP_PROTO(
- const struct nfs_commit_data *data
+ const struct nfs_commit_data *data,
+ int error
),
- TP_ARGS(data),
+ TP_ARGS(data, error),
TP_STRUCT__entry(
__field(unsigned long, error)
+ __field(unsigned long, nfs_error)
__field(dev_t, dev)
__field(u32, fhandle)
__field(u64, fileid)
@@ -2131,7 +2139,8 @@ TRACE_EVENT(ff_layout_commit_error,
TP_fast_assign(
const struct inode *inode = data->inode;
- __entry->error = data->res.op_status;
+ __entry->error = -error;
+ __entry->nfs_error = data->res.op_status;
__entry->fhandle = nfs_fhandle_hash(data->args.fh);
__entry->fileid = NFS_FILEID(inode);
__entry->dev = inode->i_sb->s_dev;
@@ -2142,14 +2151,15 @@ TRACE_EVENT(ff_layout_commit_error,
TP_printk(
"error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
- "offset=%llu count=%u dstaddr=%s",
+ "offset=%llu count=%u dstaddr=%s nfs_error=%lu (%s)",
-__entry->error,
show_nfs4_status(__entry->error),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
__entry->fhandle,
__entry->offset, __entry->count,
- __get_str(dstaddr)
+ __get_str(dstaddr), __entry->nfs_error,
+ show_nfs4_status(__entry->nfs_error)
)
);
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 5f582713bf05..3adb7d0dbec7 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -745,6 +745,14 @@ pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
return remaining;
}
+static void pnfs_reset_return_info(struct pnfs_layout_hdr *lo)
+{
+ struct pnfs_layout_segment *lseg;
+
+ list_for_each_entry(lseg, &lo->plh_return_segs, pls_list)
+ pnfs_set_plh_return_info(lo, lseg->pls_range.iomode, 0);
+}
+
static void
pnfs_free_returned_lsegs(struct pnfs_layout_hdr *lo,
struct list_head *free_me,
@@ -1246,21 +1254,15 @@ static void pnfs_clear_layoutcommit(struct inode *inode,
static void
pnfs_layoutreturn_retry_later_locked(struct pnfs_layout_hdr *lo,
const nfs4_stateid *arg_stateid,
- const struct pnfs_layout_range *range)
+ const struct pnfs_layout_range *range,
+ struct list_head *freeme)
{
- const struct pnfs_layout_segment *lseg;
- u32 seq = be32_to_cpu(arg_stateid->seqid);
-
if (pnfs_layout_is_valid(lo) &&
- nfs4_stateid_match_other(&lo->plh_stateid, arg_stateid)) {
- list_for_each_entry(lseg, &lo->plh_return_segs, pls_list) {
- if (pnfs_seqid_is_newer(lseg->pls_seq, seq) ||
- !pnfs_should_free_range(&lseg->pls_range, range))
- continue;
- pnfs_set_plh_return_info(lo, range->iomode, seq);
- break;
- }
- }
+ nfs4_stateid_match_other(&lo->plh_stateid, arg_stateid))
+ pnfs_reset_return_info(lo);
+ else
+ pnfs_mark_layout_stateid_invalid(lo, freeme);
+ pnfs_clear_layoutreturn_waitbit(lo);
}
void pnfs_layoutreturn_retry_later(struct pnfs_layout_hdr *lo,
@@ -1268,11 +1270,12 @@ void pnfs_layoutreturn_retry_later(struct pnfs_layout_hdr *lo,
const struct pnfs_layout_range *range)
{
struct inode *inode = lo->plh_inode;
+ LIST_HEAD(freeme);
spin_lock(&inode->i_lock);
- pnfs_layoutreturn_retry_later_locked(lo, arg_stateid, range);
- pnfs_clear_layoutreturn_waitbit(lo);
+ pnfs_layoutreturn_retry_later_locked(lo, arg_stateid, range, &freeme);
spin_unlock(&inode->i_lock);
+ pnfs_free_lseg_list(&freeme);
}
void pnfs_layoutreturn_free_lsegs(struct pnfs_layout_hdr *lo,
@@ -1292,6 +1295,7 @@ void pnfs_layoutreturn_free_lsegs(struct pnfs_layout_hdr *lo,
pnfs_mark_matching_lsegs_invalid(lo, &freeme, range, seq);
pnfs_free_returned_lsegs(lo, &freeme, range, seq);
pnfs_set_layout_stateid(lo, stateid, NULL, true);
+ pnfs_reset_return_info(lo);
} else
pnfs_mark_layout_stateid_invalid(lo, &freeme);
out_unlock:
@@ -1661,6 +1665,18 @@ int pnfs_roc_done(struct rpc_task *task, struct nfs4_layoutreturn_args **argpp,
/* Was there an RPC level error? If not, retry */
if (task->tk_rpc_status == 0)
break;
+ /*
+ * Is there a fatal network level error?
+ * If so release the layout, but flag the error.
+ */
+ if ((task->tk_rpc_status == -ENETDOWN ||
+ task->tk_rpc_status == -ENETUNREACH) &&
+ task->tk_flags & RPC_TASK_NETUNREACH_FATAL) {
+ *ret = 0;
+ (*respp)->lrs_present = 0;
+ retval = -EIO;
+ break;
+ }
/* If the call was not sent, let caller handle it */
if (!RPC_WAS_SENT(task))
return 0;
@@ -1695,6 +1711,7 @@ void pnfs_roc_release(struct nfs4_layoutreturn_args *args,
struct inode *inode = args->inode;
const nfs4_stateid *res_stateid = NULL;
struct nfs4_xdr_opaque_data *ld_private = args->ld_private;
+ LIST_HEAD(freeme);
switch (ret) {
case -NFS4ERR_BADSESSION:
@@ -1703,9 +1720,9 @@ void pnfs_roc_release(struct nfs4_layoutreturn_args *args,
case -NFS4ERR_NOMATCHING_LAYOUT:
spin_lock(&inode->i_lock);
pnfs_layoutreturn_retry_later_locked(lo, &args->stateid,
- &args->range);
- pnfs_clear_layoutreturn_waitbit(lo);
+ &args->range, &freeme);
spin_unlock(&inode->i_lock);
+ pnfs_free_lseg_list(&freeme);
break;
case 0:
if (res->lrs_present)
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 30d2613e912b..91ff877185c8 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -60,6 +60,7 @@ struct nfs4_pnfs_ds {
struct list_head ds_node; /* nfs4_pnfs_dev_hlist dev_dslist */
char *ds_remotestr; /* comma sep list of addrs */
struct list_head ds_addrs;
+ const struct net *ds_net;
struct nfs_client *ds_clp;
refcount_t ds_count;
unsigned long ds_state;
@@ -415,7 +416,8 @@ int pnfs_generic_commit_pagelist(struct inode *inode,
int pnfs_generic_scan_commit_lists(struct nfs_commit_info *cinfo, int max);
void pnfs_generic_write_commit_done(struct rpc_task *task, void *data);
void nfs4_pnfs_ds_put(struct nfs4_pnfs_ds *ds);
-struct nfs4_pnfs_ds *nfs4_pnfs_ds_add(struct list_head *dsaddrs,
+struct nfs4_pnfs_ds *nfs4_pnfs_ds_add(const struct net *net,
+ struct list_head *dsaddrs,
gfp_t gfp_flags);
void nfs4_pnfs_v3_ds_connect_unload(void);
int nfs4_pnfs_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds,
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index dbef837e871a..91ef486f40b9 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -16,6 +16,7 @@
#include "nfs4session.h"
#include "internal.h"
#include "pnfs.h"
+#include "netns.h"
#define NFSDBG_FACILITY NFSDBG_PNFS
@@ -504,14 +505,14 @@ EXPORT_SYMBOL_GPL(pnfs_generic_commit_pagelist);
/*
* Data server cache
*
- * Data servers can be mapped to different device ids.
- * nfs4_pnfs_ds reference counting
+ * Data servers can be mapped to different device ids, but should
+ * never be shared between net namespaces.
+ *
+ * nfs4_pnfs_ds reference counting:
* - set to 1 on allocation
* - incremented when a device id maps a data server already in the cache.
* - decremented when deviceid is removed from the cache.
*/
-static DEFINE_SPINLOCK(nfs4_ds_cache_lock);
-static LIST_HEAD(nfs4_data_server_cache);
/* Debug routines */
static void
@@ -604,11 +605,11 @@ _same_data_server_addrs_locked(const struct list_head *dsaddrs1,
* Lookup DS by addresses. nfs4_ds_cache_lock is held
*/
static struct nfs4_pnfs_ds *
-_data_server_lookup_locked(const struct list_head *dsaddrs)
+_data_server_lookup_locked(const struct nfs_net *nn, const struct list_head *dsaddrs)
{
struct nfs4_pnfs_ds *ds;
- list_for_each_entry(ds, &nfs4_data_server_cache, ds_node)
+ list_for_each_entry(ds, &nn->nfs4_data_server_cache, ds_node)
if (_same_data_server_addrs_locked(&ds->ds_addrs, dsaddrs))
return ds;
return NULL;
@@ -653,10 +654,11 @@ static void destroy_ds(struct nfs4_pnfs_ds *ds)
void nfs4_pnfs_ds_put(struct nfs4_pnfs_ds *ds)
{
- if (refcount_dec_and_lock(&ds->ds_count,
- &nfs4_ds_cache_lock)) {
+ struct nfs_net *nn = net_generic(ds->ds_net, nfs_net_id);
+
+ if (refcount_dec_and_lock(&ds->ds_count, &nn->nfs4_data_server_lock)) {
list_del_init(&ds->ds_node);
- spin_unlock(&nfs4_ds_cache_lock);
+ spin_unlock(&nn->nfs4_data_server_lock);
destroy_ds(ds);
}
}
@@ -716,8 +718,9 @@ out_err:
* uncached and return cached struct nfs4_pnfs_ds.
*/
struct nfs4_pnfs_ds *
-nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags)
+nfs4_pnfs_ds_add(const struct net *net, struct list_head *dsaddrs, gfp_t gfp_flags)
{
+ struct nfs_net *nn = net_generic(net, nfs_net_id);
struct nfs4_pnfs_ds *tmp_ds, *ds = NULL;
char *remotestr;
@@ -733,16 +736,17 @@ nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags)
/* this is only used for debugging, so it's ok if its NULL */
remotestr = nfs4_pnfs_remotestr(dsaddrs, gfp_flags);
- spin_lock(&nfs4_ds_cache_lock);
- tmp_ds = _data_server_lookup_locked(dsaddrs);
+ spin_lock(&nn->nfs4_data_server_lock);
+ tmp_ds = _data_server_lookup_locked(nn, dsaddrs);
if (tmp_ds == NULL) {
INIT_LIST_HEAD(&ds->ds_addrs);
list_splice_init(dsaddrs, &ds->ds_addrs);
ds->ds_remotestr = remotestr;
refcount_set(&ds->ds_count, 1);
INIT_LIST_HEAD(&ds->ds_node);
+ ds->ds_net = net;
ds->ds_clp = NULL;
- list_add(&ds->ds_node, &nfs4_data_server_cache);
+ list_add(&ds->ds_node, &nn->nfs4_data_server_cache);
dprintk("%s add new data server %s\n", __func__,
ds->ds_remotestr);
} else {
@@ -754,7 +758,7 @@ nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags)
refcount_read(&tmp_ds->ds_count));
ds = tmp_ds;
}
- spin_unlock(&nfs4_ds_cache_lock);
+ spin_unlock(&nn->nfs4_data_server_lock);
out:
return ds;
}
diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c
index 1c62a5a9f51d..58146e935402 100644
--- a/fs/nfs/symlink.c
+++ b/fs/nfs/symlink.c
@@ -40,31 +40,31 @@ static const char *nfs_get_link(struct dentry *dentry,
struct inode *inode,
struct delayed_call *done)
{
- struct page *page;
+ struct folio *folio;
void *err;
if (!dentry) {
err = ERR_PTR(nfs_revalidate_mapping_rcu(inode));
if (err)
return err;
- page = find_get_page(inode->i_mapping, 0);
- if (!page)
+ folio = filemap_get_folio(inode->i_mapping, 0);
+ if (IS_ERR(folio))
return ERR_PTR(-ECHILD);
- if (!PageUptodate(page)) {
- put_page(page);
+ if (!folio_test_uptodate(folio)) {
+ folio_put(folio);
return ERR_PTR(-ECHILD);
}
} else {
err = ERR_PTR(nfs_revalidate_mapping(inode, inode->i_mapping));
if (err)
return err;
- page = read_cache_page(&inode->i_data, 0, nfs_symlink_filler,
+ folio = read_cache_folio(&inode->i_data, 0, nfs_symlink_filler,
NULL);
- if (IS_ERR(page))
- return ERR_CAST(page);
+ if (IS_ERR(folio))
+ return ERR_CAST(folio);
}
- set_delayed_call(done, page_put_link, page);
- return page_address(page);
+ set_delayed_call(done, page_put_link, folio);
+ return folio_address(folio);
}
/*
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index bf77399696a7..b55467911648 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -464,18 +464,17 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry)
sdentry = NULL;
do {
- int slen;
dput(sdentry);
sillycounter++;
- slen = scnprintf(silly, sizeof(silly),
- SILLYNAME_PREFIX "%0*llx%0*x",
- SILLYNAME_FILEID_LEN, fileid,
- SILLYNAME_COUNTER_LEN, sillycounter);
+ scnprintf(silly, sizeof(silly),
+ SILLYNAME_PREFIX "%0*llx%0*x",
+ SILLYNAME_FILEID_LEN, fileid,
+ SILLYNAME_COUNTER_LEN, sillycounter);
dfprintk(VFS, "NFS: trying to rename %pd to %s\n",
dentry, silly);
- sdentry = lookup_one_len(silly, dentry->d_parent, slen);
+ sdentry = lookup_noperm(&QSTR(silly), dentry->d_parent);
/*
* N.B. Better to return EBUSY here ... it could be
* dangerous to delete the file while it's in use.
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 372bdcf5e07a..ac1731eb34ab 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -284,7 +284,9 @@ nfsd3_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp,
inode_lock_nested(inode, I_MUTEX_PARENT);
- child = lookup_one_len(argp->name, parent, argp->len);
+ child = lookup_one(&nop_mnt_idmap,
+ &QSTR_LEN(argp->name, argp->len),
+ parent);
if (IS_ERR(child)) {
status = nfserrno(PTR_ERR(child));
goto out;
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index a7a07470c1f8..ef4971d71ac4 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -1001,7 +1001,9 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp,
} else
dchild = dget(dparent);
} else
- dchild = lookup_positive_unlocked(name, dparent, namlen);
+ dchild = lookup_one_positive_unlocked(&nop_mnt_idmap,
+ &QSTR_LEN(name, namlen),
+ dparent);
if (IS_ERR(dchild))
return rv;
if (d_mountpoint(dchild))
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index b397246dae7b..fd560dcf6059 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -266,7 +266,9 @@ nfsd4_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp,
inode_lock_nested(inode, I_MUTEX_PARENT);
- child = lookup_one_len(open->op_fname, parent, open->op_fnamelen);
+ child = lookup_one(&nop_mnt_idmap,
+ &QSTR_LEN(open->op_fname, open->op_fnamelen),
+ parent);
if (IS_ERR(child)) {
status = nfserrno(PTR_ERR(child));
goto out;
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index c1d9bd07285f..acde3edab733 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -218,7 +218,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
/* lock the parent */
inode_lock(d_inode(dir));
- dentry = lookup_one_len(dname, dir, HEXDIR_LEN-1);
+ dentry = lookup_one(&nop_mnt_idmap, &QSTR(dname), dir);
if (IS_ERR(dentry)) {
status = PTR_ERR(dentry);
goto out_unlock;
@@ -316,7 +316,8 @@ nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn)
list_for_each_entry_safe(entry, tmp, &ctx.names, list) {
if (!status) {
struct dentry *dentry;
- dentry = lookup_one_len(entry->name, dir, HEXDIR_LEN-1);
+ dentry = lookup_one(&nop_mnt_idmap,
+ &QSTR(entry->name), dir);
if (IS_ERR(dentry)) {
status = PTR_ERR(dentry);
break;
@@ -339,16 +340,16 @@ nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn)
}
static int
-nfsd4_unlink_clid_dir(char *name, int namlen, struct nfsd_net *nn)
+nfsd4_unlink_clid_dir(char *name, struct nfsd_net *nn)
{
struct dentry *dir, *dentry;
int status;
- dprintk("NFSD: nfsd4_unlink_clid_dir. name %.*s\n", namlen, name);
+ dprintk("NFSD: nfsd4_unlink_clid_dir. name %s\n", name);
dir = nn->rec_file->f_path.dentry;
inode_lock_nested(d_inode(dir), I_MUTEX_PARENT);
- dentry = lookup_one_len(name, dir, namlen);
+ dentry = lookup_one(&nop_mnt_idmap, &QSTR(name), dir);
if (IS_ERR(dentry)) {
status = PTR_ERR(dentry);
goto out_unlock;
@@ -408,7 +409,7 @@ nfsd4_remove_clid_dir(struct nfs4_client *clp)
if (status < 0)
goto out_drop_write;
- status = nfsd4_unlink_clid_dir(dname, HEXDIR_LEN-1, nn);
+ status = nfsd4_unlink_clid_dir(dname, nn);
nfs4_reset_creds(original_cred);
if (status == 0) {
vfs_fsync(nn->rec_file, 0);
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index e67420729ecd..fe876395985a 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -3812,7 +3812,9 @@ nfsd4_encode_entry4_fattr(struct nfsd4_readdir *cd, const char *name,
__be32 nfserr;
int ignore_crossmnt = 0;
- dentry = lookup_positive_unlocked(name, cd->rd_fhp->fh_dentry, namlen);
+ dentry = lookup_one_positive_unlocked(&nop_mnt_idmap,
+ &QSTR_LEN(name, namlen),
+ cd->rd_fhp->fh_dentry);
if (IS_ERR(dentry))
return nfserrno(PTR_ERR(dentry));
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 6dda081eb24c..6370ac0a85fd 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -312,7 +312,8 @@ nfsd_proc_create(struct svc_rqst *rqstp)
}
inode_lock_nested(dirfhp->fh_dentry->d_inode, I_MUTEX_PARENT);
- dchild = lookup_one_len(argp->name, dirfhp->fh_dentry, argp->len);
+ dchild = lookup_one(&nop_mnt_idmap, &QSTR_LEN(argp->name, argp->len),
+ dirfhp->fh_dentry);
if (IS_ERR(dchild)) {
resp->status = nfserrno(PTR_ERR(dchild));
goto out_unlock;
@@ -331,7 +332,7 @@ nfsd_proc_create(struct svc_rqst *rqstp)
*/
resp->status = nfserr_acces;
if (!newfhp->fh_dentry) {
- printk(KERN_WARNING
+ printk(KERN_WARNING
"nfsd_proc_create: file handle not verified\n");
goto out_unlock;
}
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 9abdc4b75813..160a839af405 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -264,7 +264,8 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
goto out_nfserr;
}
} else {
- dentry = lookup_one_len_unlocked(name, dparent, len);
+ dentry = lookup_one_unlocked(&nop_mnt_idmap,
+ &QSTR_LEN(name, len), dparent);
host_err = PTR_ERR(dentry);
if (IS_ERR(dentry))
goto out_nfserr;
@@ -922,7 +923,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
* directories, but we never have and it doesn't seem to have
* caused anyone a problem. If we were to change this, note
* also that our filldir callbacks would need a variant of
- * lookup_one_len that doesn't check permissions.
+ * lookup_one_positive_unlocked() that doesn't check permissions.
*/
if (type == S_IFREG)
may_flags |= NFSD_MAY_OWNER_OVERRIDE;
@@ -1554,7 +1555,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
return nfserrno(host_err);
inode_lock_nested(dentry->d_inode, I_MUTEX_PARENT);
- dchild = lookup_one_len(fname, dentry, flen);
+ dchild = lookup_one(&nop_mnt_idmap, &QSTR_LEN(fname, flen), dentry);
host_err = PTR_ERR(dchild);
if (IS_ERR(dchild)) {
err = nfserrno(host_err);
@@ -1659,7 +1660,7 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
dentry = fhp->fh_dentry;
inode_lock_nested(dentry->d_inode, I_MUTEX_PARENT);
- dnew = lookup_one_len(fname, dentry, flen);
+ dnew = lookup_one(&nop_mnt_idmap, &QSTR_LEN(fname, flen), dentry);
if (IS_ERR(dnew)) {
err = nfserrno(PTR_ERR(dnew));
inode_unlock(dentry->d_inode);
@@ -1734,7 +1735,7 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
dirp = d_inode(ddir);
inode_lock_nested(dirp, I_MUTEX_PARENT);
- dnew = lookup_one_len(name, ddir, len);
+ dnew = lookup_one(&nop_mnt_idmap, &QSTR_LEN(name, len), ddir);
if (IS_ERR(dnew)) {
host_err = PTR_ERR(dnew);
goto out_unlock;
@@ -1867,7 +1868,7 @@ retry:
if (err != nfs_ok)
goto out_unlock;
- odentry = lookup_one_len(fname, fdentry, flen);
+ odentry = lookup_one(&nop_mnt_idmap, &QSTR_LEN(fname, flen), fdentry);
host_err = PTR_ERR(odentry);
if (IS_ERR(odentry))
goto out_nfserr;
@@ -1880,7 +1881,7 @@ retry:
goto out_dput_old;
type = d_inode(odentry)->i_mode & S_IFMT;
- ndentry = lookup_one_len(tname, tdentry, tlen);
+ ndentry = lookup_one(&nop_mnt_idmap, &QSTR_LEN(tname, tlen), tdentry);
host_err = PTR_ERR(ndentry);
if (IS_ERR(ndentry))
goto out_dput_old;
@@ -1998,7 +1999,7 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
dirp = d_inode(dentry);
inode_lock_nested(dirp, I_MUTEX_PARENT);
- rdentry = lookup_one_len(fname, dentry, flen);
+ rdentry = lookup_one(&nop_mnt_idmap, &QSTR_LEN(fname, flen), dentry);
host_err = PTR_ERR(rdentry);
if (IS_ERR(rdentry))
goto out_unlock;
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index d6cd81163030..135c49c5d848 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -9,12 +9,13 @@
#include <linux/fs.h>
#include <linux/vfs.h>
#include <linux/cred.h>
-#include <linux/parser.h>
#include <linux/buffer_head.h>
#include <linux/vmalloc.h>
#include <linux/writeback.h>
#include <linux/seq_file.h>
#include <linux/crc-itu-t.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
#include "omfs.h"
MODULE_AUTHOR("Bob Copeland <me@bobcopeland.com>");
@@ -384,79 +385,83 @@ nomem:
return -ENOMEM;
}
+struct omfs_mount_options {
+ kuid_t s_uid;
+ kgid_t s_gid;
+ int s_dmask;
+ int s_fmask;
+};
+
enum {
- Opt_uid, Opt_gid, Opt_umask, Opt_dmask, Opt_fmask, Opt_err
+ Opt_uid, Opt_gid, Opt_umask, Opt_dmask, Opt_fmask,
};
-static const match_table_t tokens = {
- {Opt_uid, "uid=%u"},
- {Opt_gid, "gid=%u"},
- {Opt_umask, "umask=%o"},
- {Opt_dmask, "dmask=%o"},
- {Opt_fmask, "fmask=%o"},
- {Opt_err, NULL},
+static const struct fs_parameter_spec omfs_param_spec[] = {
+ fsparam_uid ("uid", Opt_uid),
+ fsparam_gid ("gid", Opt_gid),
+ fsparam_u32oct ("umask", Opt_umask),
+ fsparam_u32oct ("dmask", Opt_dmask),
+ fsparam_u32oct ("fmask", Opt_fmask),
+ {}
};
-static int parse_options(char *options, struct omfs_sb_info *sbi)
+static int
+omfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
- char *p;
- substring_t args[MAX_OPT_ARGS];
- int option;
-
- if (!options)
- return 1;
-
- while ((p = strsep(&options, ",")) != NULL) {
- int token;
- if (!*p)
- continue;
-
- token = match_token(p, tokens, args);
- switch (token) {
- case Opt_uid:
- if (match_int(&args[0], &option))
- return 0;
- sbi->s_uid = make_kuid(current_user_ns(), option);
- if (!uid_valid(sbi->s_uid))
- return 0;
- break;
- case Opt_gid:
- if (match_int(&args[0], &option))
- return 0;
- sbi->s_gid = make_kgid(current_user_ns(), option);
- if (!gid_valid(sbi->s_gid))
- return 0;
- break;
- case Opt_umask:
- if (match_octal(&args[0], &option))
- return 0;
- sbi->s_fmask = sbi->s_dmask = option;
- break;
- case Opt_dmask:
- if (match_octal(&args[0], &option))
- return 0;
- sbi->s_dmask = option;
- break;
- case Opt_fmask:
- if (match_octal(&args[0], &option))
- return 0;
- sbi->s_fmask = option;
- break;
- default:
- return 0;
- }
+ struct omfs_mount_options *opts = fc->fs_private;
+ int token;
+ struct fs_parse_result result;
+
+ /* All options are ignored on remount */
+ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE)
+ return 0;
+
+ token = fs_parse(fc, omfs_param_spec, param, &result);
+ if (token < 0)
+ return token;
+
+ switch (token) {
+ case Opt_uid:
+ opts->s_uid = result.uid;
+ break;
+ case Opt_gid:
+ opts->s_gid = result.gid;
+ break;
+ case Opt_umask:
+ opts->s_fmask = opts->s_dmask = result.uint_32;
+ break;
+ case Opt_dmask:
+ opts->s_dmask = result.uint_32;
+ break;
+ case Opt_fmask:
+ opts->s_fmask = result.uint_32;
+ break;
+ default:
+ return -EINVAL;
}
- return 1;
+
+ return 0;
}
-static int omfs_fill_super(struct super_block *sb, void *data, int silent)
+static void
+omfs_set_options(struct omfs_sb_info *sbi, struct omfs_mount_options *opts)
+{
+ sbi->s_uid = opts->s_uid;
+ sbi->s_gid = opts->s_gid;
+ sbi->s_dmask = opts->s_dmask;
+ sbi->s_fmask = opts->s_fmask;
+}
+
+static int omfs_fill_super(struct super_block *sb, struct fs_context *fc)
{
struct buffer_head *bh, *bh2;
struct omfs_super_block *omfs_sb;
struct omfs_root_block *omfs_rb;
struct omfs_sb_info *sbi;
struct inode *root;
+ struct omfs_mount_options *parsed_opts = fc->fs_private;
int ret = -EINVAL;
+ int silent = fc->sb_flags & SB_SILENT;
sbi = kzalloc(sizeof(struct omfs_sb_info), GFP_KERNEL);
if (!sbi)
@@ -464,12 +469,7 @@ static int omfs_fill_super(struct super_block *sb, void *data, int silent)
sb->s_fs_info = sbi;
- sbi->s_uid = current_uid();
- sbi->s_gid = current_gid();
- sbi->s_dmask = sbi->s_fmask = current_umask();
-
- if (!parse_options((char *) data, sbi))
- goto end;
+ omfs_set_options(sbi, parsed_opts);
sb->s_maxbytes = 0xffffffff;
@@ -594,18 +594,50 @@ end:
return ret;
}
-static struct dentry *omfs_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
+static int omfs_get_tree(struct fs_context *fc)
+{
+ return get_tree_bdev(fc, omfs_fill_super);
+}
+
+static void omfs_free_fc(struct fs_context *fc);
+
+static const struct fs_context_operations omfs_context_ops = {
+ .parse_param = omfs_parse_param,
+ .get_tree = omfs_get_tree,
+ .free = omfs_free_fc,
+};
+
+static int omfs_init_fs_context(struct fs_context *fc)
+{
+ struct omfs_mount_options *opts;
+
+ opts = kzalloc(sizeof(*opts), GFP_KERNEL);
+ if (!opts)
+ return -ENOMEM;
+
+ /* Set mount options defaults */
+ opts->s_uid = current_uid();
+ opts->s_gid = current_gid();
+ opts->s_dmask = opts->s_fmask = current_umask();
+
+ fc->fs_private = opts;
+ fc->ops = &omfs_context_ops;
+
+ return 0;
+}
+
+static void omfs_free_fc(struct fs_context *fc)
{
- return mount_bdev(fs_type, flags, dev_name, data, omfs_fill_super);
+ kfree(fc->fs_private);
}
static struct file_system_type omfs_fs_type = {
- .owner = THIS_MODULE,
- .name = "omfs",
- .mount = omfs_mount,
- .kill_sb = kill_block_super,
- .fs_flags = FS_REQUIRES_DEV,
+ .owner = THIS_MODULE,
+ .name = "omfs",
+ .kill_sb = kill_block_super,
+ .fs_flags = FS_REQUIRES_DEV,
+ .init_fs_context = omfs_init_fs_context,
+ .parameters = omfs_param_spec,
};
MODULE_ALIAS_FS("omfs");
diff --git a/fs/open.c b/fs/open.c
index a9063cca9911..7828234a7caa 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -60,7 +60,10 @@ int do_truncate(struct mnt_idmap *idmap, struct dentry *dentry,
if (ret)
newattrs.ia_valid |= ret | ATTR_FORCE;
- inode_lock(dentry->d_inode);
+ ret = inode_lock_killable(dentry->d_inode);
+ if (ret)
+ return ret;
+
/* Note any delegations or leases have already been broken: */
ret = notify_change(idmap, dentry, &newattrs, NULL);
inode_unlock(dentry->d_inode);
@@ -635,7 +638,9 @@ int chmod_common(const struct path *path, umode_t mode)
if (error)
return error;
retry_deleg:
- inode_lock(inode);
+ error = inode_lock_killable(inode);
+ if (error)
+ goto out_mnt_unlock;
error = security_path_chmod(path, mode);
if (error)
goto out_unlock;
@@ -650,6 +655,7 @@ out_unlock:
if (!error)
goto retry_deleg;
}
+out_mnt_unlock:
mnt_drop_write(path->mnt);
return error;
}
@@ -769,7 +775,9 @@ retry_deleg:
return -EINVAL;
if ((group != (gid_t)-1) && !setattr_vfsgid(&newattrs, gid))
return -EINVAL;
- inode_lock(inode);
+ error = inode_lock_killable(inode);
+ if (error)
+ return error;
if (!S_ISDIR(inode->i_mode))
newattrs.ia_valid |= ATTR_KILL_SUID | ATTR_KILL_PRIV |
setattr_should_drop_sgid(idmap, inode);
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index 5ac743c6bc2e..08a6f372a352 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -32,12 +32,13 @@ static int orangefs_writepage_locked(struct folio *folio,
len = i_size_read(inode);
if (folio->private) {
wr = folio->private;
- WARN_ON(wr->pos >= len);
off = wr->pos;
- if (off + wr->len > len)
+ if ((off + wr->len > len) && (off <= len))
wlen = len - off;
else
wlen = wr->len;
+ if (wlen == 0)
+ wlen = wr->len;
} else {
WARN_ON(1);
off = folio_pos(folio);
@@ -46,8 +47,6 @@ static int orangefs_writepage_locked(struct folio *folio,
if (wlen > len - off)
wlen = len - off;
}
- /* Should've been handled in orangefs_invalidate_folio. */
- WARN_ON(off == len || off + wlen > len);
WARN_ON(wlen == 0);
bvec_set_folio(&bv, folio, wlen, offset_in_folio(folio, off));
@@ -320,6 +319,8 @@ static int orangefs_write_begin(struct file *file,
wr->len += len;
goto okay;
} else {
+ wr->pos = pos;
+ wr->len = len;
ret = orangefs_launder_folio(folio);
if (ret)
return ret;
diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c
index 444aeeccb6da..83f80fdb1567 100644
--- a/fs/overlayfs/export.c
+++ b/fs/overlayfs/export.c
@@ -385,11 +385,9 @@ static struct dentry *ovl_lookup_real_one(struct dentry *connected,
*/
take_dentry_name_snapshot(&name, real);
/*
- * No idmap handling here: it's an internal lookup. Could skip
- * permission checking altogether, but for now just use non-idmap
- * transformed ids.
+ * No idmap handling here: it's an internal lookup.
*/
- this = lookup_one_len(name.name.name, connected, name.name.len);
+ this = lookup_noperm(&name.name, connected);
release_dentry_name_snapshot(&name);
err = PTR_ERR(this);
if (IS_ERR(this)) {
diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c
index be5c65d6f848..bf722daf19a9 100644
--- a/fs/overlayfs/namei.c
+++ b/fs/overlayfs/namei.c
@@ -205,8 +205,8 @@ static struct dentry *ovl_lookup_positive_unlocked(struct ovl_lookup_data *d,
struct dentry *base, int len,
bool drop_negative)
{
- struct dentry *ret = lookup_one_unlocked(mnt_idmap(d->layer->mnt), name,
- base, len);
+ struct dentry *ret = lookup_one_unlocked(mnt_idmap(d->layer->mnt),
+ &QSTR_LEN(name, len), base);
if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) {
if (drop_negative && ret->d_lockref.count == 1) {
@@ -757,7 +757,7 @@ struct dentry *ovl_get_index_fh(struct ovl_fs *ofs, struct ovl_fh *fh)
if (err)
return ERR_PTR(err);
- index = lookup_positive_unlocked(name.name, ofs->workdir, name.len);
+ index = lookup_noperm_positive_unlocked(&name, ofs->workdir);
kfree(name.name);
if (IS_ERR(index)) {
if (PTR_ERR(index) == -ENOENT)
@@ -789,8 +789,8 @@ struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper,
if (err)
return ERR_PTR(err);
- index = lookup_one_positive_unlocked(ovl_upper_mnt_idmap(ofs), name.name,
- ofs->workdir, name.len);
+ index = lookup_one_positive_unlocked(ovl_upper_mnt_idmap(ofs), &name,
+ ofs->workdir);
if (IS_ERR(index)) {
err = PTR_ERR(index);
if (err == -ENOENT) {
@@ -1371,7 +1371,7 @@ out:
bool ovl_lower_positive(struct dentry *dentry)
{
struct ovl_entry *poe = OVL_E(dentry->d_parent);
- const struct qstr *name = &dentry->d_name;
+ struct qstr *name = &dentry->d_name;
const struct cred *old_cred;
unsigned int i;
bool positive = false;
@@ -1396,7 +1396,7 @@ bool ovl_lower_positive(struct dentry *dentry)
this = lookup_one_positive_unlocked(
mnt_idmap(parentpath->layer->mnt),
- name->name, parentpath->dentry, name->len);
+ name, parentpath->dentry);
if (IS_ERR(this)) {
switch (PTR_ERR(this)) {
case -ENOENT:
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index aef942a758ce..8baaba0a3fe5 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -402,7 +402,7 @@ static inline struct dentry *ovl_lookup_upper(struct ovl_fs *ofs,
const char *name,
struct dentry *base, int len)
{
- return lookup_one(ovl_upper_mnt_idmap(ofs), name, base, len);
+ return lookup_one(ovl_upper_mnt_idmap(ofs), &QSTR_LEN(name, len), base);
}
static inline bool ovl_open_flags_need_copy_up(int flags)
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index 881ec5592da5..44e208da417c 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -271,7 +271,6 @@ static bool ovl_fill_merge(struct dir_context *ctx, const char *name,
static int ovl_check_whiteouts(const struct path *path, struct ovl_readdir_data *rdd)
{
int err;
- struct ovl_cache_entry *p;
struct dentry *dentry, *dir = path->dentry;
const struct cred *old_cred;
@@ -280,9 +279,11 @@ static int ovl_check_whiteouts(const struct path *path, struct ovl_readdir_data
err = down_write_killable(&dir->d_inode->i_rwsem);
if (!err) {
while (rdd->first_maybe_whiteout) {
- p = rdd->first_maybe_whiteout;
+ struct ovl_cache_entry *p =
+ rdd->first_maybe_whiteout;
rdd->first_maybe_whiteout = p->next_maybe_whiteout;
- dentry = lookup_one(mnt_idmap(path->mnt), p->name, dir, p->len);
+ dentry = lookup_one(mnt_idmap(path->mnt),
+ &QSTR_LEN(p->name, p->len), dir);
if (!IS_ERR(dentry)) {
p->is_whiteout = ovl_is_whiteout(dentry);
dput(dentry);
@@ -351,6 +352,7 @@ static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list,
struct path realpath;
struct ovl_readdir_data rdd = {
.ctx.actor = ovl_fill_merge,
+ .ctx.count = INT_MAX,
.dentry = dentry,
.list = list,
.root = root,
@@ -492,7 +494,7 @@ static int ovl_cache_update(const struct path *path, struct ovl_cache_entry *p,
}
}
/* This checks also for xwhiteouts */
- this = lookup_one(mnt_idmap(path->mnt), p->name, dir, p->len);
+ this = lookup_one(mnt_idmap(path->mnt), &QSTR_LEN(p->name, p->len), dir);
if (IS_ERR_OR_NULL(this) || !this->d_inode) {
/* Mark a stale entry */
p->is_whiteout = true;
@@ -571,6 +573,7 @@ static int ovl_dir_read_impure(const struct path *path, struct list_head *list,
struct ovl_cache_entry *p, *n;
struct ovl_readdir_data rdd = {
.ctx.actor = ovl_fill_plain,
+ .ctx.count = INT_MAX,
.list = list,
.root = root,
};
@@ -672,6 +675,7 @@ static bool ovl_fill_real(struct dir_context *ctx, const char *name,
struct ovl_readdir_translate *rdt =
container_of(ctx, struct ovl_readdir_translate, ctx);
struct dir_context *orig_ctx = rdt->orig_ctx;
+ bool res;
if (rdt->parent_ino && strcmp(name, "..") == 0) {
ino = rdt->parent_ino;
@@ -686,7 +690,10 @@ static bool ovl_fill_real(struct dir_context *ctx, const char *name,
name, namelen, rdt->xinowarn);
}
- return orig_ctx->actor(orig_ctx, name, namelen, offset, ino, d_type);
+ res = orig_ctx->actor(orig_ctx, name, namelen, offset, ino, d_type);
+ ctx->count = orig_ctx->count;
+
+ return res;
}
static bool ovl_is_impure_dir(struct file *file)
@@ -713,6 +720,7 @@ static int ovl_iterate_real(struct file *file, struct dir_context *ctx)
const struct ovl_layer *lower_layer = ovl_layer_lower(dir);
struct ovl_readdir_translate rdt = {
.ctx.actor = ovl_fill_real,
+ .ctx.count = ctx->count,
.orig_ctx = ctx,
.xinobits = ovl_xino_bits(ofs),
.xinowarn = ovl_xino_warn(ofs),
@@ -1073,6 +1081,7 @@ int ovl_check_d_type_supported(const struct path *realpath)
int err;
struct ovl_readdir_data rdd = {
.ctx.actor = ovl_check_d_type,
+ .ctx.count = INT_MAX,
.d_type_supported = false,
};
@@ -1094,6 +1103,7 @@ static int ovl_workdir_cleanup_recurse(struct ovl_fs *ofs, const struct path *pa
struct ovl_cache_entry *p;
struct ovl_readdir_data rdd = {
.ctx.actor = ovl_fill_plain,
+ .ctx.count = INT_MAX,
.list = &list,
};
bool incompat = false;
@@ -1178,6 +1188,7 @@ int ovl_indexdir_cleanup(struct ovl_fs *ofs)
struct ovl_cache_entry *p;
struct ovl_readdir_data rdd = {
.ctx.actor = ovl_fill_plain,
+ .ctx.count = INT_MAX,
.list = &list,
};
diff --git a/fs/pidfs.c b/fs/pidfs.c
index d64a4cbeb0da..c1f0a067be40 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -20,6 +20,7 @@
#include <linux/time_namespace.h>
#include <linux/utsname.h>
#include <net/net_namespace.h>
+#include <linux/coredump.h>
#include "internal.h"
#include "mount.h"
@@ -33,6 +34,7 @@ static struct kmem_cache *pidfs_cachep __ro_after_init;
struct pidfs_exit_info {
__u64 cgroupid;
__s32 exit_code;
+ __u32 coredump_mask;
};
struct pidfs_inode {
@@ -240,6 +242,22 @@ static inline bool pid_in_current_pidns(const struct pid *pid)
return false;
}
+static __u32 pidfs_coredump_mask(unsigned long mm_flags)
+{
+ switch (__get_dumpable(mm_flags)) {
+ case SUID_DUMP_USER:
+ return PIDFD_COREDUMP_USER;
+ case SUID_DUMP_ROOT:
+ return PIDFD_COREDUMP_ROOT;
+ case SUID_DUMP_DISABLE:
+ return PIDFD_COREDUMP_SKIP;
+ default:
+ WARN_ON_ONCE(true);
+ }
+
+ return 0;
+}
+
static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
{
struct pidfd_info __user *uinfo = (struct pidfd_info __user *)arg;
@@ -280,6 +298,11 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
}
}
+ if (mask & PIDFD_INFO_COREDUMP) {
+ kinfo.mask |= PIDFD_INFO_COREDUMP;
+ kinfo.coredump_mask = READ_ONCE(pidfs_i(inode)->__pei.coredump_mask);
+ }
+
task = get_pid_task(pid, PIDTYPE_PID);
if (!task) {
/*
@@ -296,6 +319,13 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
if (!c)
return -ESRCH;
+ if (!(kinfo.mask & PIDFD_INFO_COREDUMP)) {
+ task_lock(task);
+ if (task->mm)
+ kinfo.coredump_mask = pidfs_coredump_mask(task->mm->flags);
+ task_unlock(task);
+ }
+
/* Unconditionally return identifiers and credentials, the rest only on request */
user_ns = current_user_ns();
@@ -559,6 +589,31 @@ void pidfs_exit(struct task_struct *tsk)
}
}
+#ifdef CONFIG_COREDUMP
+void pidfs_coredump(const struct coredump_params *cprm)
+{
+ struct pid *pid = cprm->pid;
+ struct pidfs_exit_info *exit_info;
+ struct dentry *dentry;
+ struct inode *inode;
+ __u32 coredump_mask = 0;
+
+ dentry = pid->stashed;
+ if (WARN_ON_ONCE(!dentry))
+ return;
+
+ inode = d_inode(dentry);
+ exit_info = &pidfs_i(inode)->__pei;
+ /* Note how we were coredumped. */
+ coredump_mask = pidfs_coredump_mask(cprm->mm_flags);
+ /* Note that we actually did coredump. */
+ coredump_mask |= PIDFD_COREDUMPED;
+ /* If coredumping is set to skip we should never end up here. */
+ VFS_WARN_ON_ONCE(coredump_mask & PIDFD_COREDUMP_SKIP);
+ smp_store_release(&exit_info->coredump_mask, coredump_mask);
+}
+#endif
+
static struct vfsmount *pidfs_mnt __ro_after_init;
/*
@@ -569,36 +624,14 @@ static struct vfsmount *pidfs_mnt __ro_after_init;
static int pidfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
struct iattr *attr)
{
- return -EOPNOTSUPP;
+ return anon_inode_setattr(idmap, dentry, attr);
}
-
-/*
- * User space expects pidfs inodes to have no file type in st_mode.
- *
- * In particular, 'lsof' has this legacy logic:
- *
- * type = s->st_mode & S_IFMT;
- * switch (type) {
- * ...
- * case 0:
- * if (!strcmp(p, "anon_inode"))
- * Lf->ntype = Ntype = N_ANON_INODE;
- *
- * to detect our old anon_inode logic.
- *
- * Rather than mess with our internal sane inode data, just fix it
- * up here in getattr() by masking off the format bits.
- */
static int pidfs_getattr(struct mnt_idmap *idmap, const struct path *path,
struct kstat *stat, u32 request_mask,
unsigned int query_flags)
{
- struct inode *inode = d_inode(path->dentry);
-
- generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
- stat->mode &= ~S_IFMT;
- return 0;
+ return anon_inode_getattr(idmap, path, stat, request_mask, query_flags);
}
static const struct inode_operations pidfs_inode_operations = {
@@ -768,7 +801,7 @@ static inline bool pidfs_pid_valid(struct pid *pid, const struct path *path,
{
enum pid_type type;
- if (flags & PIDFD_CLONE)
+ if (flags & PIDFD_STALE)
return true;
/*
@@ -777,10 +810,14 @@ static inline bool pidfs_pid_valid(struct pid *pid, const struct path *path,
* pidfd has been allocated perform another check that the pid
* is still alive. If it is exit information is available even
* if the task gets reaped before the pidfd is returned to
- * userspace. The only exception is PIDFD_CLONE where no task
- * linkage has been established for @pid yet and the kernel is
- * in the middle of process creation so there's nothing for
- * pidfs to miss.
+ * userspace. The only exception are indicated by PIDFD_STALE:
+ *
+ * (1) The kernel is in the middle of task creation and thus no
+ * task linkage has been established yet.
+ * (2) The caller knows @pid has been registered in pidfs at a
+ * time when the task was still alive.
+ *
+ * In both cases exit information will have been reported.
*/
if (flags & PIDFD_THREAD)
type = PIDTYPE_PID;
@@ -826,7 +863,7 @@ static int pidfs_init_inode(struct inode *inode, void *data)
const struct pid *pid = data;
inode->i_private = data;
- inode->i_flags |= S_PRIVATE;
+ inode->i_flags |= S_PRIVATE | S_ANON_INODE;
inode->i_mode |= S_IRWXU;
inode->i_op = &pidfs_inode_operations;
inode->i_fop = &pidfs_file_operations;
@@ -874,11 +911,11 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
int ret;
/*
- * Ensure that PIDFD_CLONE can be passed as a flag without
+ * Ensure that PIDFD_STALE can be passed as a flag without
* overloading other uapi pidfd flags.
*/
- BUILD_BUG_ON(PIDFD_CLONE == PIDFD_THREAD);
- BUILD_BUG_ON(PIDFD_CLONE == PIDFD_NONBLOCK);
+ BUILD_BUG_ON(PIDFD_STALE == PIDFD_THREAD);
+ BUILD_BUG_ON(PIDFD_STALE == PIDFD_NONBLOCK);
ret = path_from_stashed(&pid->stashed, pidfs_mnt, get_pid(pid), &path);
if (ret < 0)
@@ -887,7 +924,8 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
if (!pidfs_pid_valid(pid, &path, flags))
return ERR_PTR(-ESRCH);
- flags &= ~PIDFD_CLONE;
+ flags &= ~PIDFD_STALE;
+ flags |= O_RDWR;
pidfd_file = dentry_open(&path, flags, current_cred());
/* Raise PIDFD_THREAD explicitly as do_dentry_open() strips it. */
if (!IS_ERR(pidfd_file))
@@ -896,6 +934,65 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
return pidfd_file;
}
+/**
+ * pidfs_register_pid - register a struct pid in pidfs
+ * @pid: pid to pin
+ *
+ * Register a struct pid in pidfs. Needs to be paired with
+ * pidfs_put_pid() to not risk leaking the pidfs dentry and inode.
+ *
+ * Return: On success zero, on error a negative error code is returned.
+ */
+int pidfs_register_pid(struct pid *pid)
+{
+ struct path path __free(path_put) = {};
+ int ret;
+
+ might_sleep();
+
+ if (!pid)
+ return 0;
+
+ ret = path_from_stashed(&pid->stashed, pidfs_mnt, get_pid(pid), &path);
+ if (unlikely(ret))
+ return ret;
+ /* Keep the dentry and only put the reference to the mount. */
+ path.dentry = NULL;
+ return 0;
+}
+
+/**
+ * pidfs_get_pid - pin a struct pid through pidfs
+ * @pid: pid to pin
+ *
+ * Similar to pidfs_register_pid() but only valid if the caller knows
+ * there's a reference to the @pid through a dentry already that can't
+ * go away.
+ */
+void pidfs_get_pid(struct pid *pid)
+{
+ if (!pid)
+ return;
+ WARN_ON_ONCE(!stashed_dentry_get(&pid->stashed));
+}
+
+/**
+ * pidfs_put_pid - drop a pidfs reference
+ * @pid: pid to drop
+ *
+ * Drop a reference to @pid via pidfs. This is only safe if the
+ * reference has been taken via pidfs_get_pid().
+ */
+void pidfs_put_pid(struct pid *pid)
+{
+ might_sleep();
+
+ if (!pid)
+ return;
+ VFS_WARN_ON_ONCE(!pid->stashed);
+ dput(pid->stashed);
+}
+
static void pidfs_inode_init_once(void *data)
{
struct pidfs_inode *pi = data;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index b0d4e1908b22..fe33a5843fbd 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2121,7 +2121,7 @@ bool proc_fill_cache(struct file *file, struct dir_context *ctx,
unsigned type = DT_UNKNOWN;
ino_t ino = 1;
- child = d_hash_and_lookup(dir, &qname);
+ child = try_lookup_noperm(&qname, dir);
if (!child) {
DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
child = d_alloc_parallel(dir, &qname, &wq);
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 83be312159c9..bc2bc60c36cc 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -120,8 +120,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
global_node_page_state(NR_SECONDARY_PAGETABLE));
show_val_kb(m, "NFS_Unstable: ", 0);
- show_val_kb(m, "Bounce: ",
- global_zone_page_state(NR_BOUNCE));
+ show_val_kb(m, "Bounce: ", 0);
show_val_kb(m, "WritebackTmp: ",
global_node_page_state(NR_WRITEBACK_TEMP));
show_val_kb(m, "CommitLimit: ", vm_commit_limit());
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index e133b507ddf3..5c555db68aa2 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -111,7 +111,7 @@ static int show_vfsmnt(struct seq_file *m, struct vfsmount *mnt)
if (err)
goto out;
} else {
- mangle(m, r->mnt_devname ? r->mnt_devname : "none");
+ mangle(m, r->mnt_devname);
}
seq_putc(m, ' ');
/* mountpoints outside of chroot jail will give SEQ_SKIP on this */
@@ -177,7 +177,7 @@ static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt)
if (err)
goto out;
} else {
- mangle(m, r->mnt_devname ? r->mnt_devname : "none");
+ mangle(m, r->mnt_devname);
}
seq_puts(m, sb_rdonly(sb) ? " ro" : " rw");
err = show_sb_opts(m, sb);
@@ -199,17 +199,13 @@ static int show_vfsstat(struct seq_file *m, struct vfsmount *mnt)
int err;
/* device */
+ seq_puts(m, "device ");
if (sb->s_op->show_devname) {
- seq_puts(m, "device ");
err = sb->s_op->show_devname(m, mnt_path.dentry);
if (err)
goto out;
} else {
- if (r->mnt_devname) {
- seq_puts(m, "device ");
- mangle(m, r->mnt_devname);
- } else
- seq_puts(m, "no device");
+ mangle(m, r->mnt_devname);
}
/* mount point */
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 825c5c2e0962..df4a9b348769 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -2560,7 +2560,7 @@ int dquot_quota_on_mount(struct super_block *sb, char *qf_name,
struct dentry *dentry;
int error;
- dentry = lookup_positive_unlocked(qf_name, sb->s_root, strlen(qf_name));
+ dentry = lookup_noperm_positive_unlocked(&QSTR(qf_name), sb->s_root);
if (IS_ERR(dentry))
return PTR_ERR(dentry);
diff --git a/fs/read_write.c b/fs/read_write.c
index bb0ed26a0b3a..0ef70e128c4a 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -332,7 +332,9 @@ loff_t default_llseek(struct file *file, loff_t offset, int whence)
struct inode *inode = file_inode(file);
loff_t retval;
- inode_lock(inode);
+ retval = inode_lock_killable(inode);
+ if (retval)
+ return retval;
switch (whence) {
case SEEK_END:
offset += i_size_read(inode);
diff --git a/fs/readdir.c b/fs/readdir.c
index 0038efda417b..7764b8638978 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -222,6 +222,7 @@ SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
CLASS(fd_pos, f)(fd);
struct readdir_callback buf = {
.ctx.actor = fillonedir,
+ .ctx.count = 1, /* Hint to fs: just one entry. */
.dirent = dirent
};
@@ -252,7 +253,6 @@ struct getdents_callback {
struct dir_context ctx;
struct linux_dirent __user * current_dir;
int prev_reclen;
- int count;
int error;
};
@@ -266,12 +266,16 @@ static bool filldir(struct dir_context *ctx, const char *name, int namlen,
int reclen = ALIGN(offsetof(struct linux_dirent, d_name) + namlen + 2,
sizeof(long));
int prev_reclen;
+ unsigned int flags = d_type;
+
+ BUILD_BUG_ON(FILLDIR_FLAG_NOINTR & S_DT_MASK);
+ d_type &= S_DT_MASK;
buf->error = verify_dirent_name(name, namlen);
if (unlikely(buf->error))
return false;
buf->error = -EINVAL; /* only used if we fail.. */
- if (reclen > buf->count)
+ if (reclen > ctx->count)
return false;
d_ino = ino;
if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
@@ -279,7 +283,7 @@ static bool filldir(struct dir_context *ctx, const char *name, int namlen,
return false;
}
prev_reclen = buf->prev_reclen;
- if (prev_reclen && signal_pending(current))
+ if (!(flags & FILLDIR_FLAG_NOINTR) && prev_reclen && signal_pending(current))
return false;
dirent = buf->current_dir;
prev = (void __user *) dirent - prev_reclen;
@@ -296,7 +300,7 @@ static bool filldir(struct dir_context *ctx, const char *name, int namlen,
buf->current_dir = (void __user *)dirent + reclen;
buf->prev_reclen = reclen;
- buf->count -= reclen;
+ ctx->count -= reclen;
return true;
efault_end:
user_write_access_end();
@@ -311,7 +315,7 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd,
CLASS(fd_pos, f)(fd);
struct getdents_callback buf = {
.ctx.actor = filldir,
- .count = count,
+ .ctx.count = count,
.current_dir = dirent
};
int error;
@@ -329,7 +333,7 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd,
if (put_user(buf.ctx.pos, &lastdirent->d_off))
error = -EFAULT;
else
- error = count - buf.count;
+ error = count - buf.ctx.count;
}
return error;
}
@@ -338,7 +342,6 @@ struct getdents_callback64 {
struct dir_context ctx;
struct linux_dirent64 __user * current_dir;
int prev_reclen;
- int count;
int error;
};
@@ -351,15 +354,19 @@ static bool filldir64(struct dir_context *ctx, const char *name, int namlen,
int reclen = ALIGN(offsetof(struct linux_dirent64, d_name) + namlen + 1,
sizeof(u64));
int prev_reclen;
+ unsigned int flags = d_type;
+
+ BUILD_BUG_ON(FILLDIR_FLAG_NOINTR & S_DT_MASK);
+ d_type &= S_DT_MASK;
buf->error = verify_dirent_name(name, namlen);
if (unlikely(buf->error))
return false;
buf->error = -EINVAL; /* only used if we fail.. */
- if (reclen > buf->count)
+ if (reclen > ctx->count)
return false;
prev_reclen = buf->prev_reclen;
- if (prev_reclen && signal_pending(current))
+ if (!(flags & FILLDIR_FLAG_NOINTR) && prev_reclen && signal_pending(current))
return false;
dirent = buf->current_dir;
prev = (void __user *)dirent - prev_reclen;
@@ -376,7 +383,7 @@ static bool filldir64(struct dir_context *ctx, const char *name, int namlen,
buf->prev_reclen = reclen;
buf->current_dir = (void __user *)dirent + reclen;
- buf->count -= reclen;
+ ctx->count -= reclen;
return true;
efault_end:
@@ -392,7 +399,7 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd,
CLASS(fd_pos, f)(fd);
struct getdents_callback64 buf = {
.ctx.actor = filldir64,
- .count = count,
+ .ctx.count = count,
.current_dir = dirent
};
int error;
@@ -411,7 +418,7 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd,
if (put_user(d_off, &lastdirent->d_off))
error = -EFAULT;
else
- error = count - buf.count;
+ error = count - buf.ctx.count;
}
return error;
}
@@ -475,6 +482,7 @@ COMPAT_SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
CLASS(fd_pos, f)(fd);
struct compat_readdir_callback buf = {
.ctx.actor = compat_fillonedir,
+ .ctx.count = 1, /* Hint to fs: just one entry. */
.dirent = dirent
};
@@ -499,7 +507,6 @@ struct compat_getdents_callback {
struct dir_context ctx;
struct compat_linux_dirent __user *current_dir;
int prev_reclen;
- int count;
int error;
};
@@ -513,12 +520,16 @@ static bool compat_filldir(struct dir_context *ctx, const char *name, int namlen
int reclen = ALIGN(offsetof(struct compat_linux_dirent, d_name) +
namlen + 2, sizeof(compat_long_t));
int prev_reclen;
+ unsigned int flags = d_type;
+
+ BUILD_BUG_ON(FILLDIR_FLAG_NOINTR & S_DT_MASK);
+ d_type &= S_DT_MASK;
buf->error = verify_dirent_name(name, namlen);
if (unlikely(buf->error))
return false;
buf->error = -EINVAL; /* only used if we fail.. */
- if (reclen > buf->count)
+ if (reclen > ctx->count)
return false;
d_ino = ino;
if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
@@ -526,7 +537,7 @@ static bool compat_filldir(struct dir_context *ctx, const char *name, int namlen
return false;
}
prev_reclen = buf->prev_reclen;
- if (prev_reclen && signal_pending(current))
+ if (!(flags & FILLDIR_FLAG_NOINTR) && prev_reclen && signal_pending(current))
return false;
dirent = buf->current_dir;
prev = (void __user *) dirent - prev_reclen;
@@ -542,7 +553,7 @@ static bool compat_filldir(struct dir_context *ctx, const char *name, int namlen
buf->prev_reclen = reclen;
buf->current_dir = (void __user *)dirent + reclen;
- buf->count -= reclen;
+ ctx->count -= reclen;
return true;
efault_end:
user_write_access_end();
@@ -557,8 +568,8 @@ COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd,
CLASS(fd_pos, f)(fd);
struct compat_getdents_callback buf = {
.ctx.actor = compat_filldir,
+ .ctx.count = count,
.current_dir = dirent,
- .count = count
};
int error;
@@ -575,7 +586,7 @@ COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd,
if (put_user(buf.ctx.pos, &lastdirent->d_off))
error = -EFAULT;
else
- error = count - buf.count;
+ error = count - buf.ctx.count;
}
return error;
}
diff --git a/fs/select.c b/fs/select.c
index 7da531b1cf6b..9fb650d03d52 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -630,7 +630,7 @@ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];
ret = -EINVAL;
- if (n < 0)
+ if (unlikely(n < 0))
goto out_nofds;
/* max_fds can increase, so grab it once to avoid race */
@@ -857,7 +857,7 @@ static inline __poll_t do_pollfd(struct pollfd *pollfd, poll_table *pwait,
int fd = pollfd->fd;
__poll_t mask, filter;
- if (fd < 0)
+ if (unlikely(fd < 0))
return 0;
CLASS(fd, f)(fd);
diff --git a/fs/smb/client/cached_dir.c b/fs/smb/client/cached_dir.c
index 240d82c6f908..89d2dbbb742c 100644
--- a/fs/smb/client/cached_dir.c
+++ b/fs/smb/client/cached_dir.c
@@ -102,7 +102,8 @@ path_to_dentry(struct cifs_sb_info *cifs_sb, const char *path)
while (*s && *s != sep)
s++;
- child = lookup_positive_unlocked(p, dentry, s - p);
+ child = lookup_noperm_positive_unlocked(&QSTR_LEN(p, s - p),
+ dentry);
dput(dentry);
dentry = child;
} while (!IS_ERR(dentry));
@@ -201,7 +202,7 @@ replay_again:
spin_unlock(&cfids->cfid_list_lock);
/*
- * Skip any prefix paths in @path as lookup_positive_unlocked() ends up
+ * Skip any prefix paths in @path as lookup_noperm_positive_unlocked() ends up
* calling ->lookup() which already adds those through
* build_path_from_dentry(). Also, do it earlier as we might reconnect
* below when trying to send compounded request and then potentially
diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c
index a08c42363ffc..fb04e263611c 100644
--- a/fs/smb/client/cifsfs.c
+++ b/fs/smb/client/cifsfs.c
@@ -929,7 +929,8 @@ cifs_get_root(struct smb3_fs_context *ctx, struct super_block *sb)
while (*s && *s != sep)
s++;
- child = lookup_positive_unlocked(p, dentry, s - p);
+ child = lookup_noperm_positive_unlocked(&QSTR_LEN(p, s - p),
+ dentry);
dput(dentry);
dentry = child;
} while (!IS_ERR(dentry));
diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c
index 851b74f557c1..950aa4f912f5 100644
--- a/fs/smb/client/file.c
+++ b/fs/smb/client/file.c
@@ -160,8 +160,10 @@ static int cifs_prepare_read(struct netfs_io_subrequest *subreq)
server = cifs_pick_channel(tlink_tcon(req->cfile->tlink)->ses);
rdata->server = server;
- cifs_negotiate_rsize(server, cifs_sb->ctx,
- tlink_tcon(req->cfile->tlink));
+ if (cifs_sb->ctx->rsize == 0) {
+ cifs_negotiate_rsize(server, cifs_sb->ctx,
+ tlink_tcon(req->cfile->tlink));
+ }
rc = server->ops->wait_mtu_credits(server, cifs_sb->ctx->rsize,
&size, &rdata->credits);
diff --git a/fs/smb/client/readdir.c b/fs/smb/client/readdir.c
index 50f96259d9ad..f9f11cbf89be 100644
--- a/fs/smb/client/readdir.c
+++ b/fs/smb/client/readdir.c
@@ -9,6 +9,7 @@
*
*/
#include <linux/fs.h>
+#include <linux/namei.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/stat.h>
@@ -78,7 +79,7 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name,
cifs_dbg(FYI, "%s: for %s\n", __func__, name->name);
- dentry = d_hash_and_lookup(parent, name);
+ dentry = try_lookup_noperm(name, parent);
if (!dentry) {
/*
* If we know that the inode will need to be revalidated
@@ -733,7 +734,10 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos,
else
cifs_buf_release(cfile->srch_inf.
ntwrk_buf_start);
+ /* Reset all pointers to the network buffer to prevent stale references */
cfile->srch_inf.ntwrk_buf_start = NULL;
+ cfile->srch_inf.srch_entries_start = NULL;
+ cfile->srch_inf.last_entry = NULL;
}
rc = initiate_cifs_search(xid, file, full_path);
if (rc) {
@@ -756,11 +760,11 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos,
rc = server->ops->query_dir_next(xid, tcon, &cfile->fid,
search_flags,
&cfile->srch_inf);
+ if (rc)
+ return -ENOENT;
/* FindFirst/Next set last_entry to NULL on malformed reply */
if (cfile->srch_inf.last_entry)
cifs_save_resume_key(cfile->srch_inf.last_entry, cfile);
- if (rc)
- return -ENOENT;
}
if (index_to_find < cfile->srch_inf.index_of_last_entry) {
/* we found the buffer that contains the entry */
diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c
index 0b35816d551f..4e28632b5fd6 100644
--- a/fs/smb/client/smb2pdu.c
+++ b/fs/smb/client/smb2pdu.c
@@ -2968,7 +2968,7 @@ replay_again:
/* Eventually save off posix specific response info and timestamps */
err_free_rsp_buf:
- free_rsp_buf(resp_buftype, rsp);
+ free_rsp_buf(resp_buftype, rsp_iov.iov_base);
kfree(pc_buf);
err_free_req:
cifs_small_buf_release(req);
diff --git a/fs/smb/server/oplock.c b/fs/smb/server/oplock.c
index 03f606afad93..d7a8a580d013 100644
--- a/fs/smb/server/oplock.c
+++ b/fs/smb/server/oplock.c
@@ -146,12 +146,9 @@ static struct oplock_info *opinfo_get_list(struct ksmbd_inode *ci)
{
struct oplock_info *opinfo;
- if (list_empty(&ci->m_op_list))
- return NULL;
-
down_read(&ci->m_lock);
- opinfo = list_first_entry(&ci->m_op_list, struct oplock_info,
- op_entry);
+ opinfo = list_first_entry_or_null(&ci->m_op_list, struct oplock_info,
+ op_entry);
if (opinfo) {
if (opinfo->conn == NULL ||
!atomic_inc_not_zero(&opinfo->refcount))
diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c
index f2a2be8467c6..8d414239b3fe 100644
--- a/fs/smb/server/smb2pdu.c
+++ b/fs/smb/server/smb2pdu.c
@@ -4120,9 +4120,10 @@ static int process_query_dir_entries(struct smb2_query_dir_private *priv)
return -EINVAL;
lock_dir(priv->dir_fp);
- dent = lookup_one(idmap, priv->d_info->name,
- priv->dir_fp->filp->f_path.dentry,
- priv->d_info->name_len);
+ dent = lookup_one(idmap,
+ &QSTR_LEN(priv->d_info->name,
+ priv->d_info->name_len),
+ priv->dir_fp->filp->f_path.dentry);
unlock_dir(priv->dir_fp);
if (IS_ERR(dent)) {
diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c
index 482eba0f4dc1..baf0d3031a44 100644
--- a/fs/smb/server/vfs.c
+++ b/fs/smb/server/vfs.c
@@ -409,10 +409,15 @@ static int ksmbd_vfs_stream_write(struct ksmbd_file *fp, char *buf, loff_t *pos,
ksmbd_debug(VFS, "write stream data pos : %llu, count : %zd\n",
*pos, count);
+ if (*pos >= XATTR_SIZE_MAX) {
+ pr_err("stream write position %lld is out of bounds\n", *pos);
+ return -EINVAL;
+ }
+
size = *pos + count;
if (size > XATTR_SIZE_MAX) {
size = XATTR_SIZE_MAX;
- count = (*pos + count) - XATTR_SIZE_MAX;
+ count = XATTR_SIZE_MAX - *pos;
}
v_len = ksmbd_vfs_getcasexattr(idmap,
@@ -426,13 +431,6 @@ static int ksmbd_vfs_stream_write(struct ksmbd_file *fp, char *buf, loff_t *pos,
goto out;
}
- if (v_len <= *pos) {
- pr_err("stream write position %lld is out of bounds (stream length: %zd)\n",
- *pos, v_len);
- err = -EINVAL;
- goto out;
- }
-
if (v_len < size) {
wbuf = kvzalloc(size, KSMBD_DEFAULT_GFP);
if (!wbuf) {
@@ -684,7 +682,7 @@ int ksmbd_vfs_rename(struct ksmbd_work *work, const struct path *old_path,
struct ksmbd_file *parent_fp;
int new_type;
int err, lookup_flags = LOOKUP_NO_SYMLINKS;
- int target_lookup_flags = LOOKUP_RENAME_TARGET;
+ int target_lookup_flags = LOOKUP_RENAME_TARGET | LOOKUP_CREATE;
if (ksmbd_override_fsids(work))
return -ENOMEM;
diff --git a/fs/stat.c b/fs/stat.c
index 3d9222807214..52c604ebbff8 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -254,7 +254,7 @@ int vfs_getattr(const struct path *path, struct kstat *stat,
int retval;
retval = security_inode_getattr(path);
- if (retval)
+ if (unlikely(retval))
return retval;
return vfs_getattr_nosec(path, stat, request_mask, query_flags);
}
@@ -425,7 +425,7 @@ SYSCALL_DEFINE2(stat, const char __user *, filename,
int error;
error = vfs_stat(filename, &stat);
- if (error)
+ if (unlikely(error))
return error;
return cp_old_stat(&stat, statbuf);
@@ -438,7 +438,7 @@ SYSCALL_DEFINE2(lstat, const char __user *, filename,
int error;
error = vfs_lstat(filename, &stat);
- if (error)
+ if (unlikely(error))
return error;
return cp_old_stat(&stat, statbuf);
@@ -447,12 +447,13 @@ SYSCALL_DEFINE2(lstat, const char __user *, filename,
SYSCALL_DEFINE2(fstat, unsigned int, fd, struct __old_kernel_stat __user *, statbuf)
{
struct kstat stat;
- int error = vfs_fstat(fd, &stat);
+ int error;
- if (!error)
- error = cp_old_stat(&stat, statbuf);
+ error = vfs_fstat(fd, &stat);
+ if (unlikely(error))
+ return error;
- return error;
+ return cp_old_stat(&stat, statbuf);
}
#endif /* __ARCH_WANT_OLD_STAT */
@@ -506,10 +507,12 @@ SYSCALL_DEFINE2(newstat, const char __user *, filename,
struct stat __user *, statbuf)
{
struct kstat stat;
- int error = vfs_stat(filename, &stat);
+ int error;
- if (error)
+ error = vfs_stat(filename, &stat);
+ if (unlikely(error))
return error;
+
return cp_new_stat(&stat, statbuf);
}
@@ -520,7 +523,7 @@ SYSCALL_DEFINE2(newlstat, const char __user *, filename,
int error;
error = vfs_lstat(filename, &stat);
- if (error)
+ if (unlikely(error))
return error;
return cp_new_stat(&stat, statbuf);
@@ -534,8 +537,9 @@ SYSCALL_DEFINE4(newfstatat, int, dfd, const char __user *, filename,
int error;
error = vfs_fstatat(dfd, filename, &stat, flag);
- if (error)
+ if (unlikely(error))
return error;
+
return cp_new_stat(&stat, statbuf);
}
#endif
@@ -543,12 +547,13 @@ SYSCALL_DEFINE4(newfstatat, int, dfd, const char __user *, filename,
SYSCALL_DEFINE2(newfstat, unsigned int, fd, struct stat __user *, statbuf)
{
struct kstat stat;
- int error = vfs_fstat(fd, &stat);
+ int error;
- if (!error)
- error = cp_new_stat(&stat, statbuf);
+ error = vfs_fstat(fd, &stat);
+ if (unlikely(error))
+ return error;
- return error;
+ return cp_new_stat(&stat, statbuf);
}
#endif
diff --git a/fs/super.c b/fs/super.c
index 97a17f9d9023..bcc4e87123c8 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -39,7 +39,8 @@
#include <uapi/linux/mount.h>
#include "internal.h"
-static int thaw_super_locked(struct super_block *sb, enum freeze_holder who);
+static int thaw_super_locked(struct super_block *sb, enum freeze_holder who,
+ const void *freeze_owner);
static LIST_HEAD(super_blocks);
static DEFINE_SPINLOCK(sb_lock);
@@ -201,7 +202,7 @@ static unsigned long super_cache_scan(struct shrinker *shrink,
inodes = list_lru_shrink_count(&sb->s_inode_lru, sc);
dentries = list_lru_shrink_count(&sb->s_dentry_lru, sc);
- total_objects = dentries + inodes + fs_objects + 1;
+ total_objects = dentries + inodes + fs_objects;
if (!total_objects)
total_objects = 1;
@@ -887,52 +888,48 @@ void drop_super_exclusive(struct super_block *sb)
}
EXPORT_SYMBOL(drop_super_exclusive);
-static void __iterate_supers(void (*f)(struct super_block *))
-{
- struct super_block *sb, *p = NULL;
-
- spin_lock(&sb_lock);
- list_for_each_entry(sb, &super_blocks, s_list) {
- if (super_flags(sb, SB_DYING))
- continue;
- sb->s_count++;
- spin_unlock(&sb_lock);
+enum super_iter_flags_t {
+ SUPER_ITER_EXCL = (1U << 0),
+ SUPER_ITER_UNLOCKED = (1U << 1),
+ SUPER_ITER_REVERSE = (1U << 2),
+};
- f(sb);
+static inline struct super_block *first_super(enum super_iter_flags_t flags)
+{
+ if (flags & SUPER_ITER_REVERSE)
+ return list_last_entry(&super_blocks, struct super_block, s_list);
+ return list_first_entry(&super_blocks, struct super_block, s_list);
+}
- spin_lock(&sb_lock);
- if (p)
- __put_super(p);
- p = sb;
- }
- if (p)
- __put_super(p);
- spin_unlock(&sb_lock);
+static inline struct super_block *next_super(struct super_block *sb,
+ enum super_iter_flags_t flags)
+{
+ if (flags & SUPER_ITER_REVERSE)
+ return list_prev_entry(sb, s_list);
+ return list_next_entry(sb, s_list);
}
-/**
- * iterate_supers - call function for all active superblocks
- * @f: function to call
- * @arg: argument to pass to it
- *
- * Scans the superblock list and calls given function, passing it
- * locked superblock and given argument.
- */
-void iterate_supers(void (*f)(struct super_block *, void *), void *arg)
+
+static void __iterate_supers(void (*f)(struct super_block *, void *), void *arg,
+ enum super_iter_flags_t flags)
{
struct super_block *sb, *p = NULL;
+ bool excl = flags & SUPER_ITER_EXCL;
- spin_lock(&sb_lock);
- list_for_each_entry(sb, &super_blocks, s_list) {
- bool locked;
+ guard(spinlock)(&sb_lock);
+ for (sb = first_super(flags);
+ !list_entry_is_head(sb, &super_blocks, s_list);
+ sb = next_super(sb, flags)) {
+ if (super_flags(sb, SB_DYING))
+ continue;
sb->s_count++;
spin_unlock(&sb_lock);
- locked = super_lock_shared(sb);
- if (locked) {
- if (sb->s_root)
- f(sb, arg);
- super_unlock_shared(sb);
+ if (flags & SUPER_ITER_UNLOCKED) {
+ f(sb, arg);
+ } else if (super_lock(sb, excl)) {
+ f(sb, arg);
+ super_unlock(sb, excl);
}
spin_lock(&sb_lock);
@@ -942,7 +939,11 @@ void iterate_supers(void (*f)(struct super_block *, void *), void *arg)
}
if (p)
__put_super(p);
- spin_unlock(&sb_lock);
+}
+
+void iterate_supers(void (*f)(struct super_block *, void *), void *arg)
+{
+ __iterate_supers(f, arg, 0);
}
/**
@@ -963,15 +964,15 @@ void iterate_supers_type(struct file_system_type *type,
hlist_for_each_entry(sb, &type->fs_supers, s_instances) {
bool locked;
+ if (super_flags(sb, SB_DYING))
+ continue;
+
sb->s_count++;
spin_unlock(&sb_lock);
locked = super_lock_shared(sb);
- if (locked) {
- if (sb->s_root)
- f(sb, arg);
- super_unlock_shared(sb);
- }
+ if (locked)
+ f(sb, arg);
spin_lock(&sb_lock);
if (p)
@@ -991,23 +992,21 @@ struct super_block *user_get_super(dev_t dev, bool excl)
spin_lock(&sb_lock);
list_for_each_entry(sb, &super_blocks, s_list) {
- if (sb->s_dev == dev) {
- bool locked;
-
- sb->s_count++;
- spin_unlock(&sb_lock);
- /* still alive? */
- locked = super_lock(sb, excl);
- if (locked) {
- if (sb->s_root)
- return sb;
- super_unlock(sb, excl);
- }
- /* nope, got unmounted */
- spin_lock(&sb_lock);
- __put_super(sb);
- break;
- }
+ bool locked;
+
+ if (sb->s_dev != dev)
+ continue;
+
+ sb->s_count++;
+ spin_unlock(&sb_lock);
+
+ locked = super_lock(sb, excl);
+ if (locked)
+ return sb;
+
+ spin_lock(&sb_lock);
+ __put_super(sb);
+ break;
}
spin_unlock(&sb_lock);
return NULL;
@@ -1111,11 +1110,9 @@ cancel_readonly:
return retval;
}
-static void do_emergency_remount_callback(struct super_block *sb)
+static void do_emergency_remount_callback(struct super_block *sb, void *unused)
{
- bool locked = super_lock_excl(sb);
-
- if (locked && sb->s_root && sb->s_bdev && !sb_rdonly(sb)) {
+ if (sb->s_bdev && !sb_rdonly(sb)) {
struct fs_context *fc;
fc = fs_context_for_reconfigure(sb->s_root,
@@ -1126,13 +1123,12 @@ static void do_emergency_remount_callback(struct super_block *sb)
put_fs_context(fc);
}
}
- if (locked)
- super_unlock_excl(sb);
}
static void do_emergency_remount(struct work_struct *work)
{
- __iterate_supers(do_emergency_remount_callback);
+ __iterate_supers(do_emergency_remount_callback, NULL,
+ SUPER_ITER_EXCL | SUPER_ITER_REVERSE);
kfree(work);
printk("Emergency Remount complete\n");
}
@@ -1148,24 +1144,18 @@ void emergency_remount(void)
}
}
-static void do_thaw_all_callback(struct super_block *sb)
+static void do_thaw_all_callback(struct super_block *sb, void *unused)
{
- bool locked = super_lock_excl(sb);
-
- if (locked && sb->s_root) {
- if (IS_ENABLED(CONFIG_BLOCK))
- while (sb->s_bdev && !bdev_thaw(sb->s_bdev))
- pr_warn("Emergency Thaw on %pg\n", sb->s_bdev);
- thaw_super_locked(sb, FREEZE_HOLDER_USERSPACE);
- return;
- }
- if (locked)
- super_unlock_excl(sb);
+ if (IS_ENABLED(CONFIG_BLOCK))
+ while (sb->s_bdev && !bdev_thaw(sb->s_bdev))
+ pr_warn("Emergency Thaw on %pg\n", sb->s_bdev);
+ thaw_super_locked(sb, FREEZE_HOLDER_USERSPACE, NULL);
+ return;
}
static void do_thaw_all(struct work_struct *work)
{
- __iterate_supers(do_thaw_all_callback);
+ __iterate_supers(do_thaw_all_callback, NULL, SUPER_ITER_EXCL);
kfree(work);
printk(KERN_WARNING "Emergency Thaw complete\n");
}
@@ -1186,6 +1176,66 @@ void emergency_thaw_all(void)
}
}
+static inline bool get_active_super(struct super_block *sb)
+{
+ bool active = false;
+
+ if (super_lock_excl(sb)) {
+ active = atomic_inc_not_zero(&sb->s_active);
+ super_unlock_excl(sb);
+ }
+ return active;
+}
+
+static const char *filesystems_freeze_ptr = "filesystems_freeze";
+
+static void filesystems_freeze_callback(struct super_block *sb, void *unused)
+{
+ if (!sb->s_op->freeze_fs && !sb->s_op->freeze_super)
+ return;
+
+ if (!get_active_super(sb))
+ return;
+
+ if (sb->s_op->freeze_super)
+ sb->s_op->freeze_super(sb, FREEZE_EXCL | FREEZE_HOLDER_KERNEL,
+ filesystems_freeze_ptr);
+ else
+ freeze_super(sb, FREEZE_EXCL | FREEZE_HOLDER_KERNEL,
+ filesystems_freeze_ptr);
+
+ deactivate_super(sb);
+}
+
+void filesystems_freeze(void)
+{
+ __iterate_supers(filesystems_freeze_callback, NULL,
+ SUPER_ITER_UNLOCKED | SUPER_ITER_REVERSE);
+}
+
+static void filesystems_thaw_callback(struct super_block *sb, void *unused)
+{
+ if (!sb->s_op->freeze_fs && !sb->s_op->freeze_super)
+ return;
+
+ if (!get_active_super(sb))
+ return;
+
+ if (sb->s_op->thaw_super)
+ sb->s_op->thaw_super(sb, FREEZE_EXCL | FREEZE_HOLDER_KERNEL,
+ filesystems_freeze_ptr);
+ else
+ thaw_super(sb, FREEZE_EXCL | FREEZE_HOLDER_KERNEL,
+ filesystems_freeze_ptr);
+
+ deactivate_super(sb);
+}
+
+void filesystems_thaw(void)
+{
+ __iterate_supers(filesystems_thaw_callback, NULL, SUPER_ITER_UNLOCKED);
+}
+
static DEFINE_IDA(unnamed_dev_ida);
/**
@@ -1479,10 +1529,10 @@ static int fs_bdev_freeze(struct block_device *bdev)
if (sb->s_op->freeze_super)
error = sb->s_op->freeze_super(sb,
- FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE);
+ FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE, NULL);
else
error = freeze_super(sb,
- FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE);
+ FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE, NULL);
if (!error)
error = sync_blockdev(bdev);
deactivate_super(sb);
@@ -1528,10 +1578,10 @@ static int fs_bdev_thaw(struct block_device *bdev)
if (sb->s_op->thaw_super)
error = sb->s_op->thaw_super(sb,
- FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE);
+ FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE, NULL);
else
error = thaw_super(sb,
- FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE);
+ FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE, NULL);
deactivate_super(sb);
return error;
}
@@ -1903,7 +1953,7 @@ static int wait_for_partially_frozen(struct super_block *sb)
}
#define FREEZE_HOLDERS (FREEZE_HOLDER_KERNEL | FREEZE_HOLDER_USERSPACE)
-#define FREEZE_FLAGS (FREEZE_HOLDERS | FREEZE_MAY_NEST)
+#define FREEZE_FLAGS (FREEZE_HOLDERS | FREEZE_MAY_NEST | FREEZE_EXCL)
static inline int freeze_inc(struct super_block *sb, enum freeze_holder who)
{
@@ -1929,11 +1979,34 @@ static inline int freeze_dec(struct super_block *sb, enum freeze_holder who)
return sb->s_writers.freeze_kcount + sb->s_writers.freeze_ucount;
}
-static inline bool may_freeze(struct super_block *sb, enum freeze_holder who)
+static inline bool may_freeze(struct super_block *sb, enum freeze_holder who,
+ const void *freeze_owner)
{
+ lockdep_assert_held(&sb->s_umount);
+
WARN_ON_ONCE((who & ~FREEZE_FLAGS));
WARN_ON_ONCE(hweight32(who & FREEZE_HOLDERS) > 1);
+ if (who & FREEZE_EXCL) {
+ if (WARN_ON_ONCE(!(who & FREEZE_HOLDER_KERNEL)))
+ return false;
+ if (WARN_ON_ONCE(who & ~(FREEZE_EXCL | FREEZE_HOLDER_KERNEL)))
+ return false;
+ if (WARN_ON_ONCE(!freeze_owner))
+ return false;
+ /* This freeze already has a specific owner. */
+ if (sb->s_writers.freeze_owner)
+ return false;
+ /*
+ * This is already frozen multiple times so we're just
+ * going to take a reference count and mark the freeze as
+ * being owned by the caller.
+ */
+ if (sb->s_writers.freeze_kcount + sb->s_writers.freeze_ucount)
+ sb->s_writers.freeze_owner = freeze_owner;
+ return true;
+ }
+
if (who & FREEZE_HOLDER_KERNEL)
return (who & FREEZE_MAY_NEST) ||
sb->s_writers.freeze_kcount == 0;
@@ -1943,10 +2016,61 @@ static inline bool may_freeze(struct super_block *sb, enum freeze_holder who)
return false;
}
+static inline bool may_unfreeze(struct super_block *sb, enum freeze_holder who,
+ const void *freeze_owner)
+{
+ lockdep_assert_held(&sb->s_umount);
+
+ WARN_ON_ONCE((who & ~FREEZE_FLAGS));
+ WARN_ON_ONCE(hweight32(who & FREEZE_HOLDERS) > 1);
+
+ if (who & FREEZE_EXCL) {
+ if (WARN_ON_ONCE(!(who & FREEZE_HOLDER_KERNEL)))
+ return false;
+ if (WARN_ON_ONCE(who & ~(FREEZE_EXCL | FREEZE_HOLDER_KERNEL)))
+ return false;
+ if (WARN_ON_ONCE(!freeze_owner))
+ return false;
+ if (WARN_ON_ONCE(sb->s_writers.freeze_kcount == 0))
+ return false;
+ /* This isn't exclusively frozen. */
+ if (!sb->s_writers.freeze_owner)
+ return false;
+ /* This isn't exclusively frozen by us. */
+ if (sb->s_writers.freeze_owner != freeze_owner)
+ return false;
+ /*
+ * This is still frozen multiple times so we're just
+ * going to drop our reference count and undo our
+ * exclusive freeze.
+ */
+ if ((sb->s_writers.freeze_kcount + sb->s_writers.freeze_ucount) > 1)
+ sb->s_writers.freeze_owner = NULL;
+ return true;
+ }
+
+ if (who & FREEZE_HOLDER_KERNEL) {
+ /*
+ * Someone's trying to steal the reference belonging to
+ * @sb->s_writers.freeze_owner.
+ */
+ if (sb->s_writers.freeze_kcount == 1 &&
+ sb->s_writers.freeze_owner)
+ return false;
+ return sb->s_writers.freeze_kcount > 0;
+ }
+
+ if (who & FREEZE_HOLDER_USERSPACE)
+ return sb->s_writers.freeze_ucount > 0;
+
+ return false;
+}
+
/**
* freeze_super - lock the filesystem and force it into a consistent state
* @sb: the super to lock
* @who: context that wants to freeze
+ * @freeze_owner: owner of the freeze
*
* Syncs the super to make sure the filesystem is consistent and calls the fs's
* freeze_fs. Subsequent calls to this without first thawing the fs may return
@@ -1998,7 +2122,7 @@ static inline bool may_freeze(struct super_block *sb, enum freeze_holder who)
* Return: If the freeze was successful zero is returned. If the freeze
* failed a negative error code is returned.
*/
-int freeze_super(struct super_block *sb, enum freeze_holder who)
+int freeze_super(struct super_block *sb, enum freeze_holder who, const void *freeze_owner)
{
int ret;
@@ -2010,7 +2134,7 @@ int freeze_super(struct super_block *sb, enum freeze_holder who)
retry:
if (sb->s_writers.frozen == SB_FREEZE_COMPLETE) {
- if (may_freeze(sb, who))
+ if (may_freeze(sb, who, freeze_owner))
ret = !!WARN_ON_ONCE(freeze_inc(sb, who) == 1);
else
ret = -EBUSY;
@@ -2032,6 +2156,7 @@ retry:
if (sb_rdonly(sb)) {
/* Nothing to do really... */
WARN_ON_ONCE(freeze_inc(sb, who) > 1);
+ sb->s_writers.freeze_owner = freeze_owner;
sb->s_writers.frozen = SB_FREEZE_COMPLETE;
wake_up_var(&sb->s_writers.frozen);
super_unlock_excl(sb);
@@ -2079,6 +2204,7 @@ retry:
* when frozen is set to SB_FREEZE_COMPLETE, and for thaw_super().
*/
WARN_ON_ONCE(freeze_inc(sb, who) > 1);
+ sb->s_writers.freeze_owner = freeze_owner;
sb->s_writers.frozen = SB_FREEZE_COMPLETE;
wake_up_var(&sb->s_writers.frozen);
lockdep_sb_freeze_release(sb);
@@ -2093,13 +2219,17 @@ EXPORT_SYMBOL(freeze_super);
* removes that state without releasing the other state or unlocking the
* filesystem.
*/
-static int thaw_super_locked(struct super_block *sb, enum freeze_holder who)
+static int thaw_super_locked(struct super_block *sb, enum freeze_holder who,
+ const void *freeze_owner)
{
int error = -EINVAL;
if (sb->s_writers.frozen != SB_FREEZE_COMPLETE)
goto out_unlock;
+ if (!may_unfreeze(sb, who, freeze_owner))
+ goto out_unlock;
+
/*
* All freezers share a single active reference.
* So just unlock in case there are any left.
@@ -2109,6 +2239,7 @@ static int thaw_super_locked(struct super_block *sb, enum freeze_holder who)
if (sb_rdonly(sb)) {
sb->s_writers.frozen = SB_UNFROZEN;
+ sb->s_writers.freeze_owner = NULL;
wake_up_var(&sb->s_writers.frozen);
goto out_deactivate;
}
@@ -2126,6 +2257,7 @@ static int thaw_super_locked(struct super_block *sb, enum freeze_holder who)
}
sb->s_writers.frozen = SB_UNFROZEN;
+ sb->s_writers.freeze_owner = NULL;
wake_up_var(&sb->s_writers.frozen);
sb_freeze_unlock(sb, SB_FREEZE_FS);
out_deactivate:
@@ -2141,6 +2273,7 @@ out_unlock:
* thaw_super -- unlock filesystem
* @sb: the super to thaw
* @who: context that wants to freeze
+ * @freeze_owner: owner of the freeze
*
* Unlocks the filesystem and marks it writeable again after freeze_super()
* if there are no remaining freezes on the filesystem.
@@ -2154,13 +2287,14 @@ out_unlock:
* have been frozen through the block layer via multiple block devices.
* The filesystem remains frozen until all block devices are unfrozen.
*/
-int thaw_super(struct super_block *sb, enum freeze_holder who)
+int thaw_super(struct super_block *sb, enum freeze_holder who,
+ const void *freeze_owner)
{
if (!super_lock_excl(sb)) {
WARN_ON_ONCE("Dying superblock while thawing!");
return -EINVAL;
}
- return thaw_super_locked(sb, who);
+ return thaw_super_locked(sb, who, freeze_owner);
}
EXPORT_SYMBOL(thaw_super);
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index cb1af30b49f5..a3fd3cc591bd 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -555,7 +555,7 @@ struct dentry *tracefs_start_creating(const char *name, struct dentry *parent)
if (unlikely(IS_DEADDIR(d_inode(parent))))
dentry = ERR_PTR(-ENOENT);
else
- dentry = lookup_one_len(name, parent, strlen(name));
+ dentry = lookup_noperm(&QSTR(name), parent);
if (!IS_ERR(dentry) && d_inode(dentry)) {
dput(dentry);
dentry = ERR_PTR(-EEXIST);
diff --git a/fs/vboxsf/file.c b/fs/vboxsf/file.c
index b780deb81b02..b492794f8e9a 100644
--- a/fs/vboxsf/file.c
+++ b/fs/vboxsf/file.c
@@ -262,40 +262,42 @@ static struct vboxsf_handle *vboxsf_get_write_handle(struct vboxsf_inode *sf_i)
return sf_handle;
}
-static int vboxsf_writepage(struct page *page, struct writeback_control *wbc)
+static int vboxsf_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = mapping->host;
+ struct folio *folio = NULL;
struct vboxsf_inode *sf_i = VBOXSF_I(inode);
struct vboxsf_handle *sf_handle;
- loff_t off = page_offset(page);
loff_t size = i_size_read(inode);
- u32 nwrite = PAGE_SIZE;
- u8 *buf;
- int err;
-
- if (off + PAGE_SIZE > size)
- nwrite = size & ~PAGE_MASK;
+ int error;
sf_handle = vboxsf_get_write_handle(sf_i);
if (!sf_handle)
return -EBADF;
- buf = kmap(page);
- err = vboxsf_write(sf_handle->root, sf_handle->handle,
- off, &nwrite, buf);
- kunmap(page);
+ while ((folio = writeback_iter(mapping, wbc, folio, &error))) {
+ loff_t off = folio_pos(folio);
+ u32 nwrite = folio_size(folio);
+ u8 *buf;
- kref_put(&sf_handle->refcount, vboxsf_handle_release);
+ if (nwrite > size - off)
+ nwrite = size - off;
- if (err == 0) {
- /* mtime changed */
- sf_i->force_restat = 1;
- } else {
- ClearPageUptodate(page);
+ buf = kmap_local_folio(folio, 0);
+ error = vboxsf_write(sf_handle->root, sf_handle->handle,
+ off, &nwrite, buf);
+ kunmap_local(buf);
+
+ folio_unlock(folio);
}
- unlock_page(page);
- return err;
+ kref_put(&sf_handle->refcount, vboxsf_handle_release);
+
+ /* mtime changed */
+ if (error == 0)
+ sf_i->force_restat = 1;
+ return error;
}
static int vboxsf_write_end(struct file *file, struct address_space *mapping,
@@ -347,10 +349,11 @@ out:
*/
const struct address_space_operations vboxsf_reg_aops = {
.read_folio = vboxsf_read_folio,
- .writepage = vboxsf_writepage,
+ .writepages = vboxsf_writepages,
.dirty_folio = filemap_dirty_folio,
.write_begin = simple_write_begin,
.write_end = vboxsf_write_end,
+ .migrate_folio = filemap_migrate_folio,
};
static const char *vboxsf_get_link(struct dentry *dentry, struct inode *inode,
diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c
index e629663e460a..9b598c5790ad 100644
--- a/fs/xfs/scrub/fscounters.c
+++ b/fs/xfs/scrub/fscounters.c
@@ -123,7 +123,7 @@ xchk_fsfreeze(
{
int error;
- error = freeze_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL);
+ error = freeze_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL, NULL);
trace_xchk_fsfreeze(sc, error);
return error;
}
@@ -135,7 +135,7 @@ xchk_fsthaw(
int error;
/* This should always succeed, we have a kernel freeze */
- error = thaw_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL);
+ error = thaw_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL, NULL);
trace_xchk_fsthaw(sc, error);
return error;
}
diff --git a/fs/xfs/scrub/orphanage.c b/fs/xfs/scrub/orphanage.c
index 3537f3cca6d5..9c12cb844231 100644
--- a/fs/xfs/scrub/orphanage.c
+++ b/fs/xfs/scrub/orphanage.c
@@ -153,8 +153,7 @@ xrep_orphanage_create(
/* Try to find the orphanage directory. */
inode_lock_nested(root_inode, I_MUTEX_PARENT);
- orphanage_dentry = lookup_one_len(ORPHANAGE, root_dentry,
- strlen(ORPHANAGE));
+ orphanage_dentry = lookup_noperm(&QSTR(ORPHANAGE), root_dentry);
if (IS_ERR(orphanage_dentry)) {
error = PTR_ERR(orphanage_dentry);
goto out_unlock_root;
@@ -445,7 +444,7 @@ xrep_adoption_check_dcache(
if (!d_orphanage)
return 0;
- d_child = d_hash_and_lookup(d_orphanage, &qname);
+ d_child = try_lookup_noperm(&qname, d_orphanage);
if (d_child) {
trace_xrep_adoption_check_child(sc->mp, d_child);
@@ -482,7 +481,7 @@ xrep_adoption_zap_dcache(
if (!d_orphanage)
return;
- d_child = d_hash_and_lookup(d_orphanage, &qname);
+ d_child = try_lookup_noperm(&qname, d_orphanage);
while (d_child != NULL) {
trace_xrep_adoption_invalidate_child(sc->mp, d_child);
diff --git a/fs/xfs/xfs_bio_io.c b/fs/xfs/xfs_bio_io.c
index fe21c76f75b8..2a736d10eafb 100644
--- a/fs/xfs/xfs_bio_io.c
+++ b/fs/xfs/xfs_bio_io.c
@@ -18,42 +18,36 @@ xfs_rw_bdev(
enum req_op op)
{
- unsigned int is_vmalloc = is_vmalloc_addr(data);
- unsigned int left = count;
+ unsigned int done = 0, added;
int error;
struct bio *bio;
- if (is_vmalloc && op == REQ_OP_WRITE)
- flush_kernel_vmap_range(data, count);
+ op |= REQ_META | REQ_SYNC;
+ if (!is_vmalloc_addr(data))
+ return bdev_rw_virt(bdev, sector, data, count, op);
- bio = bio_alloc(bdev, bio_max_vecs(left), op | REQ_META | REQ_SYNC,
- GFP_KERNEL);
+ bio = bio_alloc(bdev, bio_max_vecs(count), op, GFP_KERNEL);
bio->bi_iter.bi_sector = sector;
do {
- struct page *page = kmem_to_page(data);
- unsigned int off = offset_in_page(data);
- unsigned int len = min_t(unsigned, left, PAGE_SIZE - off);
-
- while (bio_add_page(bio, page, len, off) != len) {
+ added = bio_add_vmalloc_chunk(bio, data + done, count - done);
+ if (!added) {
struct bio *prev = bio;
- bio = bio_alloc(prev->bi_bdev, bio_max_vecs(left),
+ bio = bio_alloc(prev->bi_bdev,
+ bio_max_vecs(count - done),
prev->bi_opf, GFP_KERNEL);
bio->bi_iter.bi_sector = bio_end_sector(prev);
bio_chain(prev, bio);
-
submit_bio(prev);
}
-
- data += len;
- left -= len;
- } while (left > 0);
+ done += added;
+ } while (done < count);
error = submit_bio_wait(bio);
bio_put(bio);
- if (is_vmalloc && op == REQ_OP_READ)
+ if (op == REQ_OP_READ)
invalidate_kernel_vmap_range(data, count);
return error;
}
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 1a2b3f06fa71..f2d00774a84f 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1333,45 +1333,18 @@ static void
xfs_buf_submit_bio(
struct xfs_buf *bp)
{
+ unsigned int len = BBTOB(bp->b_length);
+ unsigned int nr_vecs = bio_add_max_vecs(bp->b_addr, len);
unsigned int map = 0;
struct blk_plug plug;
struct bio *bio;
- if (is_vmalloc_addr(bp->b_addr)) {
- unsigned int size = BBTOB(bp->b_length);
- unsigned int alloc_size = roundup(size, PAGE_SIZE);
- void *data = bp->b_addr;
-
- bio = bio_alloc(bp->b_target->bt_bdev, alloc_size >> PAGE_SHIFT,
- xfs_buf_bio_op(bp), GFP_NOIO);
-
- do {
- unsigned int len = min(size, PAGE_SIZE);
-
- ASSERT(offset_in_page(data) == 0);
- __bio_add_page(bio, vmalloc_to_page(data), len, 0);
- data += len;
- size -= len;
- } while (size);
-
- flush_kernel_vmap_range(bp->b_addr, alloc_size);
- } else {
- /*
- * Single folio or slab allocation. Must be contiguous and thus
- * only a single bvec is needed.
- *
- * This uses the page based bio add helper for now as that is
- * the lowest common denominator between folios and slab
- * allocations. To be replaced with a better block layer
- * helper soon (hopefully).
- */
- bio = bio_alloc(bp->b_target->bt_bdev, 1, xfs_buf_bio_op(bp),
- GFP_NOIO);
- __bio_add_page(bio, virt_to_page(bp->b_addr),
- BBTOB(bp->b_length),
- offset_in_page(bp->b_addr));
- }
-
+ bio = bio_alloc(bp->b_target->bt_bdev, nr_vecs, xfs_buf_bio_op(bp),
+ GFP_NOIO);
+ if (is_vmalloc_addr(bp->b_addr))
+ bio_add_vmalloc(bio, bp->b_addr, len);
+ else
+ bio_add_virt_nofail(bio, bp->b_addr, len);
bio->bi_private = bp;
bio->bi_end_io = xfs_buf_bio_end_io;
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 980aabc49512..793468b4d30d 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1607,27 +1607,6 @@ xlog_bio_end_io(
&iclog->ic_end_io_work);
}
-static int
-xlog_map_iclog_data(
- struct bio *bio,
- void *data,
- size_t count)
-{
- do {
- struct page *page = kmem_to_page(data);
- unsigned int off = offset_in_page(data);
- size_t len = min_t(size_t, count, PAGE_SIZE - off);
-
- if (bio_add_page(bio, page, len, off) != len)
- return -EIO;
-
- data += len;
- count -= len;
- } while (count);
-
- return 0;
-}
-
STATIC void
xlog_write_iclog(
struct xlog *log,
@@ -1693,11 +1672,12 @@ xlog_write_iclog(
iclog->ic_flags &= ~(XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA);
- if (xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count))
- goto shutdown;
-
- if (is_vmalloc_addr(iclog->ic_data))
- flush_kernel_vmap_range(iclog->ic_data, count);
+ if (is_vmalloc_addr(iclog->ic_data)) {
+ if (!bio_add_vmalloc(&iclog->ic_bio, iclog->ic_data, count))
+ goto shutdown;
+ } else {
+ bio_add_virt_nofail(&iclog->ic_bio, iclog->ic_data, count);
+ }
/*
* If this log buffer would straddle the end of the log we will have
diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c
index ed8d8ed42f0a..3545dc1d953c 100644
--- a/fs/xfs/xfs_notify_failure.c
+++ b/fs/xfs/xfs_notify_failure.c
@@ -127,7 +127,7 @@ xfs_dax_notify_failure_freeze(
struct super_block *sb = mp->m_super;
int error;
- error = freeze_super(sb, FREEZE_HOLDER_KERNEL);
+ error = freeze_super(sb, FREEZE_HOLDER_KERNEL, NULL);
if (error)
xfs_emerg(mp, "already frozen by kernel, err=%d", error);
@@ -143,7 +143,7 @@ xfs_dax_notify_failure_thaw(
int error;
if (kernel_frozen) {
- error = thaw_super(sb, FREEZE_HOLDER_KERNEL);
+ error = thaw_super(sb, FREEZE_HOLDER_KERNEL, NULL);
if (error)
xfs_emerg(mp, "still frozen after notify failure, err=%d",
error);
@@ -153,7 +153,7 @@ xfs_dax_notify_failure_thaw(
* Also thaw userspace call anyway because the device is about to be
* removed immediately.
*/
- thaw_super(sb, FREEZE_HOLDER_USERSPACE);
+ thaw_super(sb, FREEZE_HOLDER_USERSPACE, NULL);
}
static int
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index b2dd0c0bf509..4a11ddccc563 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1149,7 +1149,7 @@ xfs_init_percpu_counters(
return 0;
free_freecounters:
- while (--i > 0)
+ while (--i >= 0)
percpu_counter_destroy(&mp->m_free[i].count);
percpu_counter_destroy(&mp->m_delalloc_rtextents);
free_delalloc:
@@ -2114,6 +2114,21 @@ xfs_fs_reconfigure(
if (error)
return error;
+ /* attr2 -> noattr2 */
+ if (xfs_has_noattr2(new_mp)) {
+ if (xfs_has_crc(mp)) {
+ xfs_warn(mp,
+ "attr2 is always enabled for a V5 filesystem - can't be changed.");
+ return -EINVAL;
+ }
+ mp->m_features &= ~XFS_FEAT_ATTR2;
+ mp->m_features |= XFS_FEAT_NOATTR2;
+ } else if (xfs_has_attr2(new_mp)) {
+ /* noattr2 -> attr2 */
+ mp->m_features &= ~XFS_FEAT_NOATTR2;
+ mp->m_features |= XFS_FEAT_ATTR2;
+ }
+
/* inode32 -> inode64 */
if (xfs_has_small_inums(mp) && !xfs_has_small_inums(new_mp)) {
mp->m_features &= ~XFS_FEAT_SMALL_INUMS;
@@ -2126,6 +2141,17 @@ xfs_fs_reconfigure(
mp->m_maxagi = xfs_set_inode_alloc(mp, mp->m_sb.sb_agcount);
}
+ /*
+ * Now that mp has been modified according to the remount options, we
+ * do a final option validation with xfs_finish_flags() just like it is
+ * just like it is done during mount. We cannot use
+ * done during mount. We cannot use xfs_finish_flags() on new_mp as it
+ * contains only the user given options.
+ */
+ error = xfs_finish_flags(mp);
+ if (error)
+ return error;
+
/* ro -> rw */
if (xfs_is_readonly(mp) && !(flags & SB_RDONLY)) {
error = xfs_remount_rw(mp);
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 85a649fec6ac..67c328d23e4a 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -315,7 +315,7 @@ xfs_ail_splice(
}
/*
- * Delete the given item from the AIL. Return a pointer to the item.
+ * Delete the given item from the AIL.
*/
static void
xfs_ail_delete(
@@ -777,26 +777,28 @@ xfs_ail_update_finish(
}
/*
- * xfs_trans_ail_update - bulk AIL insertion operation.
+ * xfs_trans_ail_update_bulk - bulk AIL insertion operation.
*
- * @xfs_trans_ail_update takes an array of log items that all need to be
+ * @xfs_trans_ail_update_bulk takes an array of log items that all need to be
* positioned at the same LSN in the AIL. If an item is not in the AIL, it will
- * be added. Otherwise, it will be repositioned by removing it and re-adding
- * it to the AIL. If we move the first item in the AIL, update the log tail to
- * match the new minimum LSN in the AIL.
+ * be added. Otherwise, it will be repositioned by removing it and re-adding
+ * it to the AIL.
*
- * This function takes the AIL lock once to execute the update operations on
- * all the items in the array, and as such should not be called with the AIL
- * lock held. As a result, once we have the AIL lock, we need to check each log
- * item LSN to confirm it needs to be moved forward in the AIL.
+ * If we move the first item in the AIL, update the log tail to match the new
+ * minimum LSN in the AIL.
*
- * To optimise the insert operation, we delete all the items from the AIL in
- * the first pass, moving them into a temporary list, then splice the temporary
- * list into the correct position in the AIL. This avoids needing to do an
- * insert operation on every item.
+ * This function should be called with the AIL lock held.
*
- * This function must be called with the AIL lock held. The lock is dropped
- * before returning.
+ * To optimise the insert operation, we add all items to a temporary list, then
+ * splice this list into the correct position in the AIL.
+ *
+ * Items that are already in the AIL are first deleted from their current
+ * location before being added to the temporary list.
+ *
+ * This avoids needing to do an insert operation on every item.
+ *
+ * The AIL lock is dropped by xfs_ail_update_finish() before returning to
+ * the caller.
*/
void
xfs_trans_ail_update_bulk(
diff --git a/fs/xfs/xfs_zone_gc.c b/fs/xfs/xfs_zone_gc.c
index 81c94dd1d596..d613a4094db6 100644
--- a/fs/xfs/xfs_zone_gc.c
+++ b/fs/xfs/xfs_zone_gc.c
@@ -807,7 +807,8 @@ xfs_zone_gc_write_chunk(
{
struct xfs_zone_gc_data *data = chunk->data;
struct xfs_mount *mp = chunk->ip->i_mount;
- unsigned int folio_offset = chunk->bio.bi_io_vec->bv_offset;
+ phys_addr_t bvec_paddr =
+ bvec_phys(bio_first_bvec_all(&chunk->bio));
struct xfs_gc_bio *split_chunk;
if (chunk->bio.bi_status)
@@ -822,7 +823,7 @@ xfs_zone_gc_write_chunk(
bio_reset(&chunk->bio, mp->m_rtdev_targp->bt_bdev, REQ_OP_WRITE);
bio_add_folio_nofail(&chunk->bio, chunk->scratch->folio, chunk->len,
- folio_offset);
+ offset_in_folio(chunk->scratch->folio, bvec_paddr));
while ((split_chunk = xfs_zone_gc_split_write(data, chunk)))
xfs_zone_gc_submit_write(data, split_chunk);
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index faf1eb87895d..d165eb979f21 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -1111,28 +1111,19 @@ static int zonefs_read_super(struct super_block *sb)
struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
struct zonefs_super *super;
u32 crc, stored_crc;
- struct page *page;
- struct bio_vec bio_vec;
- struct bio bio;
int ret;
- page = alloc_page(GFP_KERNEL);
- if (!page)
+ super = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!super)
return -ENOMEM;
- bio_init(&bio, sb->s_bdev, &bio_vec, 1, REQ_OP_READ);
- bio.bi_iter.bi_sector = 0;
- __bio_add_page(&bio, page, PAGE_SIZE, 0);
-
- ret = submit_bio_wait(&bio);
+ ret = bdev_rw_virt(sb->s_bdev, 0, super, PAGE_SIZE, REQ_OP_READ);
if (ret)
- goto free_page;
-
- super = page_address(page);
+ goto free_super;
ret = -EINVAL;
if (le32_to_cpu(super->s_magic) != ZONEFS_MAGIC)
- goto free_page;
+ goto free_super;
stored_crc = le32_to_cpu(super->s_crc);
super->s_crc = 0;
@@ -1140,14 +1131,14 @@ static int zonefs_read_super(struct super_block *sb)
if (crc != stored_crc) {
zonefs_err(sb, "Invalid checksum (Expected 0x%08x, got 0x%08x)",
crc, stored_crc);
- goto free_page;
+ goto free_super;
}
sbi->s_features = le64_to_cpu(super->s_features);
if (sbi->s_features & ~ZONEFS_F_DEFINED_FEATURES) {
zonefs_err(sb, "Unknown features set 0x%llx\n",
sbi->s_features);
- goto free_page;
+ goto free_super;
}
if (sbi->s_features & ZONEFS_F_UID) {
@@ -1155,7 +1146,7 @@ static int zonefs_read_super(struct super_block *sb)
le32_to_cpu(super->s_uid));
if (!uid_valid(sbi->s_uid)) {
zonefs_err(sb, "Invalid UID feature\n");
- goto free_page;
+ goto free_super;
}
}
@@ -1164,7 +1155,7 @@ static int zonefs_read_super(struct super_block *sb)
le32_to_cpu(super->s_gid));
if (!gid_valid(sbi->s_gid)) {
zonefs_err(sb, "Invalid GID feature\n");
- goto free_page;
+ goto free_super;
}
}
@@ -1173,15 +1164,14 @@ static int zonefs_read_super(struct super_block *sb)
if (memchr_inv(super->s_reserved, 0, sizeof(super->s_reserved))) {
zonefs_err(sb, "Reserved area is being used\n");
- goto free_page;
+ goto free_super;
}
import_uuid(&sbi->s_uuid, super->s_uuid);
ret = 0;
-free_page:
- __free_page(page);
-
+free_super:
+ kfree(super);
return ret;
}
diff --git a/include/drm/drm_gpusvm.h b/include/drm/drm_gpusvm.h
index df120b4d1f83..eaf704d3d05e 100644
--- a/include/drm/drm_gpusvm.h
+++ b/include/drm/drm_gpusvm.h
@@ -89,6 +89,7 @@ struct drm_gpusvm_devmem_ops {
* @ops: Pointer to the operations structure for GPU SVM device memory
* @dpagemap: The struct drm_pagemap of the pages this allocation belongs to.
* @size: Size of device memory allocation
+ * @timeslice_expiration: Timeslice expiration in jiffies
*/
struct drm_gpusvm_devmem {
struct device *dev;
@@ -97,6 +98,7 @@ struct drm_gpusvm_devmem {
const struct drm_gpusvm_devmem_ops *ops;
struct drm_pagemap *dpagemap;
size_t size;
+ u64 timeslice_expiration;
};
/**
@@ -186,6 +188,31 @@ struct drm_gpusvm_notifier {
};
/**
+ * struct drm_gpusvm_range_flags - Structure representing a GPU SVM range flags
+ *
+ * @migrate_devmem: Flag indicating whether the range can be migrated to device memory
+ * @unmapped: Flag indicating if the range has been unmapped
+ * @partial_unmap: Flag indicating if the range has been partially unmapped
+ * @has_devmem_pages: Flag indicating if the range has devmem pages
+ * @has_dma_mapping: Flag indicating if the range has a DMA mapping
+ * @__flags: Flags for range in u16 form (used for READ_ONCE)
+ */
+struct drm_gpusvm_range_flags {
+ union {
+ struct {
+ /* All flags below must be set upon creation */
+ u16 migrate_devmem : 1;
+ /* All flags below must be set / cleared under notifier lock */
+ u16 unmapped : 1;
+ u16 partial_unmap : 1;
+ u16 has_devmem_pages : 1;
+ u16 has_dma_mapping : 1;
+ };
+ u16 __flags;
+ };
+};
+
+/**
* struct drm_gpusvm_range - Structure representing a GPU SVM range
*
* @gpusvm: Pointer to the GPU SVM structure
@@ -198,11 +225,6 @@ struct drm_gpusvm_notifier {
* @dpagemap: The struct drm_pagemap of the device pages we're dma-mapping.
* Note this is assuming only one drm_pagemap per range is allowed.
* @flags: Flags for range
- * @flags.migrate_devmem: Flag indicating whether the range can be migrated to device memory
- * @flags.unmapped: Flag indicating if the range has been unmapped
- * @flags.partial_unmap: Flag indicating if the range has been partially unmapped
- * @flags.has_devmem_pages: Flag indicating if the range has devmem pages
- * @flags.has_dma_mapping: Flag indicating if the range has a DMA mapping
*
* This structure represents a GPU SVM range used for tracking memory ranges
* mapped in a DRM device.
@@ -216,15 +238,7 @@ struct drm_gpusvm_range {
unsigned long notifier_seq;
struct drm_pagemap_device_addr *dma_addr;
struct drm_pagemap *dpagemap;
- struct {
- /* All flags below must be set upon creation */
- u16 migrate_devmem : 1;
- /* All flags below must be set / cleared under notifier lock */
- u16 unmapped : 1;
- u16 partial_unmap : 1;
- u16 has_devmem_pages : 1;
- u16 has_dma_mapping : 1;
- } flags;
+ struct drm_gpusvm_range_flags flags;
};
/**
@@ -283,17 +297,22 @@ struct drm_gpusvm {
* @check_pages_threshold: Check CPU pages for present if chunk is less than or
* equal to threshold. If not present, reduce chunk
* size.
+ * @timeslice_ms: The timeslice MS which in minimum time a piece of memory
+ * remains with either exclusive GPU or CPU access.
* @in_notifier: entering from a MMU notifier
* @read_only: operating on read-only memory
* @devmem_possible: possible to use device memory
+ * @devmem_only: use only device memory
*
* Context that is DRM GPUSVM is operating in (i.e. user arguments).
*/
struct drm_gpusvm_ctx {
unsigned long check_pages_threshold;
+ unsigned long timeslice_ms;
unsigned int in_notifier :1;
unsigned int read_only :1;
unsigned int devmem_possible :1;
+ unsigned int devmem_only :1;
};
int drm_gpusvm_init(struct drm_gpusvm *gpusvm,
diff --git a/include/drm/intel/pciids.h b/include/drm/intel/pciids.h
index d212848d07f3..a7ce9523c50d 100644
--- a/include/drm/intel/pciids.h
+++ b/include/drm/intel/pciids.h
@@ -861,6 +861,10 @@
MACRO__(0xB081, ## __VA_ARGS__), \
MACRO__(0xB082, ## __VA_ARGS__), \
MACRO__(0xB083, ## __VA_ARGS__), \
+ MACRO__(0xB084, ## __VA_ARGS__), \
+ MACRO__(0xB085, ## __VA_ARGS__), \
+ MACRO__(0xB086, ## __VA_ARGS__), \
+ MACRO__(0xB087, ## __VA_ARGS__), \
MACRO__(0xB08F, ## __VA_ARGS__), \
MACRO__(0xB090, ## __VA_ARGS__), \
MACRO__(0xB0A0, ## __VA_ARGS__), \
diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h
index a946e0203e6d..8f7931eb7d16 100644
--- a/include/linux/alloc_tag.h
+++ b/include/linux/alloc_tag.h
@@ -104,6 +104,16 @@ DECLARE_PER_CPU(struct alloc_tag_counters, _shared_alloc_tag);
#else /* ARCH_NEEDS_WEAK_PER_CPU */
+#ifdef MODULE
+
+#define DEFINE_ALLOC_TAG(_alloc_tag) \
+ static struct alloc_tag _alloc_tag __used __aligned(8) \
+ __section(ALLOC_TAG_SECTION_NAME) = { \
+ .ct = CODE_TAG_INIT, \
+ .counters = NULL };
+
+#else /* MODULE */
+
#define DEFINE_ALLOC_TAG(_alloc_tag) \
static DEFINE_PER_CPU(struct alloc_tag_counters, _alloc_tag_cntr); \
static struct alloc_tag _alloc_tag __used __aligned(8) \
@@ -111,6 +121,8 @@ DECLARE_PER_CPU(struct alloc_tag_counters, _shared_alloc_tag);
.ct = CODE_TAG_INIT, \
.counters = &_alloc_tag_cntr };
+#endif /* MODULE */
+
#endif /* ARCH_NEEDS_WEAK_PER_CPU */
DECLARE_STATIC_KEY_MAYBE(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT,
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index 1625c8529e70..65abd5ab8836 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -90,7 +90,6 @@ struct linux_binfmt {
struct list_head lh;
struct module *module;
int (*load_binary)(struct linux_binprm *);
- int (*load_shlib)(struct file *);
#ifdef CONFIG_COREDUMP
int (*core_dump)(struct coredump_params *cprm);
unsigned long min_coredump; /* minimal dump size */
diff --git a/include/linux/bio.h b/include/linux/bio.h
index cafc7c215de8..9c37c66ef9ca 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -11,6 +11,7 @@
#include <linux/uio.h>
#define BIO_MAX_VECS 256U
+#define BIO_MAX_INLINE_VECS UIO_MAXIOV
struct queue_limits;
@@ -402,7 +403,6 @@ static inline int bio_iov_vecs_to_alloc(struct iov_iter *iter, int max_segs)
struct request_queue;
-extern int submit_bio_wait(struct bio *bio);
void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table,
unsigned short max_vecs, blk_opf_t opf);
extern void bio_uninit(struct bio *);
@@ -417,6 +417,30 @@ void __bio_add_page(struct bio *bio, struct page *page,
unsigned int len, unsigned int off);
void bio_add_folio_nofail(struct bio *bio, struct folio *folio, size_t len,
size_t off);
+void bio_add_virt_nofail(struct bio *bio, void *vaddr, unsigned len);
+
+/**
+ * bio_add_max_vecs - number of bio_vecs needed to add data to a bio
+ * @kaddr: kernel virtual address to add
+ * @len: length in bytes to add
+ *
+ * Calculate how many bio_vecs need to be allocated to add the kernel virtual
+ * address range in [@kaddr:@len] in the worse case.
+ */
+static inline unsigned int bio_add_max_vecs(void *kaddr, unsigned int len)
+{
+ if (is_vmalloc_addr(kaddr))
+ return DIV_ROUND_UP(offset_in_page(kaddr) + len, PAGE_SIZE);
+ return 1;
+}
+
+unsigned int bio_add_vmalloc_chunk(struct bio *bio, void *vaddr, unsigned len);
+bool bio_add_vmalloc(struct bio *bio, void *vaddr, unsigned int len);
+
+int submit_bio_wait(struct bio *bio);
+int bdev_rw_virt(struct block_device *bdev, sector_t sector, void *data,
+ size_t len, enum req_op op);
+
int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter);
void bio_iov_bvec_set(struct bio *bio, const struct iov_iter *iter);
void __bio_release_pages(struct bio *bio, bool mark_dirty);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 8eb9b3310167..de8c85a03bb7 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -9,6 +9,7 @@
#include <linux/prefetch.h>
#include <linux/srcu.h>
#include <linux/rw_hint.h>
+#include <linux/rwsem.h>
struct blk_mq_tags;
struct blk_flush_queue;
@@ -506,6 +507,9 @@ enum hctx_type {
* request_queue.tag_set_list.
* @srcu: Use as lock when type of the request queue is blocking
* (BLK_MQ_F_BLOCKING).
+ * @update_nr_hwq_lock:
+ * Synchronize updating nr_hw_queues with add/del disk &
+ * switching elevator.
*/
struct blk_mq_tag_set {
const struct blk_mq_ops *ops;
@@ -527,6 +531,8 @@ struct blk_mq_tag_set {
struct mutex tag_list_lock;
struct list_head tag_list;
struct srcu_struct *srcu;
+
+ struct rw_semaphore update_nr_hwq_lock;
};
/**
@@ -1031,8 +1037,8 @@ int blk_rq_map_user_io(struct request *, struct rq_map_data *,
int blk_rq_map_user_iov(struct request_queue *, struct request *,
struct rq_map_data *, const struct iov_iter *, gfp_t);
int blk_rq_unmap_user(struct bio *);
-int blk_rq_map_kern(struct request_queue *, struct request *, void *,
- unsigned int, gfp_t);
+int blk_rq_map_kern(struct request *rq, void *kbuf, unsigned int len,
+ gfp_t gfp);
int blk_rq_append_bio(struct request *rq, struct bio *bio);
void blk_execute_rq_nowait(struct request *rq, bool at_head);
blk_status_t blk_execute_rq(struct request *rq, bool at_head);
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index dce7615c35e7..3d1577f07c1c 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -220,6 +220,7 @@ struct bio {
unsigned short bi_flags; /* BIO_* below */
unsigned short bi_ioprio;
enum rw_hint bi_write_hint;
+ u8 bi_write_stream;
blk_status_t bi_status;
atomic_t __bi_remaining;
@@ -286,7 +287,6 @@ struct bio {
enum {
BIO_PAGE_PINNED, /* Unpin pages in bio_release_pages() */
BIO_CLONED, /* doesn't own data */
- BIO_BOUNCED, /* bio is a bounce bio */
BIO_QUIET, /* Make BIO Quiet */
BIO_CHAIN, /* chained bio, ->bi_remaining in effect */
BIO_REFFED, /* bio has elevated ->bi_cnt */
@@ -296,6 +296,14 @@ enum {
* of this bio. */
BIO_CGROUP_ACCT, /* has been accounted to a cgroup */
BIO_QOS_THROTTLED, /* bio went through rq_qos throttle path */
+ /*
+ * This bio has completed bps throttling at the single tg granularity,
+ * which is different from BIO_BPS_THROTTLED. When the bio is enqueued
+ * into the sq->queued of the upper tg, or is about to be dispatched,
+ * this flag needs to be cleared. Since blk-throttle and rq_qos are not
+ * on the same hierarchical level, reuse the value.
+ */
+ BIO_TG_BPS_THROTTLED = BIO_QOS_THROTTLED,
BIO_QOS_MERGED, /* but went through rq_qos merge path */
BIO_REMAPPED,
BIO_ZONE_WRITE_PLUGGING, /* bio handled through zone write plugging */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 9a1f0ee40b56..332b56f323d9 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -182,7 +182,6 @@ struct gendisk {
struct list_head slave_bdevs;
#endif
struct timer_rand_state *random;
- atomic_t sync_io; /* RAID */
struct disk_events *ev;
#ifdef CONFIG_BLK_DEV_ZONED
@@ -218,6 +217,8 @@ struct gendisk {
* devices that do not have multiple independent access ranges.
*/
struct blk_independent_access_ranges *ia_ranges;
+
+ struct mutex rqos_state_mutex; /* rqos state change mutex */
};
/**
@@ -331,9 +332,6 @@ typedef unsigned int __bitwise blk_features_t;
/* skip this queue in blk_mq_(un)quiesce_tagset */
#define BLK_FEAT_SKIP_TAGSET_QUIESCE ((__force blk_features_t)(1u << 13))
-/* bounce all highmem pages */
-#define BLK_FEAT_BOUNCE_HIGH ((__force blk_features_t)(1u << 14))
-
/* undocumented magic for bcache */
#define BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE \
((__force blk_features_t)(1u << 15))
@@ -347,7 +345,7 @@ typedef unsigned int __bitwise blk_features_t;
*/
#define BLK_FEAT_INHERIT_MASK \
(BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | BLK_FEAT_ROTATIONAL | \
- BLK_FEAT_STABLE_WRITES | BLK_FEAT_ZONED | BLK_FEAT_BOUNCE_HIGH | \
+ BLK_FEAT_STABLE_WRITES | BLK_FEAT_ZONED | \
BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE)
/* internal flags in queue_limits.flags */
@@ -405,6 +403,9 @@ struct queue_limits {
unsigned short max_integrity_segments;
unsigned short max_discard_segments;
+ unsigned short max_write_streams;
+ unsigned int write_stream_granularity;
+
unsigned int max_open_zones;
unsigned int max_active_zones;
@@ -644,6 +645,8 @@ enum {
QUEUE_FLAG_RQ_ALLOC_TIME, /* record rq->alloc_time_ns */
QUEUE_FLAG_HCTX_ACTIVE, /* at least one blk-mq hctx is active */
QUEUE_FLAG_SQ_SCHED, /* single queue style io dispatch */
+ QUEUE_FLAG_DISABLE_WBT_DEF, /* for sched to disable/enable wbt */
+ QUEUE_FLAG_NO_ELV_SWITCH, /* can't switch elevator any more */
QUEUE_FLAG_MAX
};
@@ -679,6 +682,10 @@ void blk_queue_flag_clear(unsigned int flag, struct request_queue *q);
#define blk_queue_sq_sched(q) test_bit(QUEUE_FLAG_SQ_SCHED, &(q)->queue_flags)
#define blk_queue_skip_tagset_quiesce(q) \
((q)->limits.features & BLK_FEAT_SKIP_TAGSET_QUIESCE)
+#define blk_queue_disable_wbt(q) \
+ test_bit(QUEUE_FLAG_DISABLE_WBT_DEF, &(q)->queue_flags)
+#define blk_queue_no_elv_switch(q) \
+ test_bit(QUEUE_FLAG_NO_ELV_SWITCH, &(q)->queue_flags)
extern void blk_set_pm_only(struct request_queue *q);
extern void blk_clear_pm_only(struct request_queue *q);
@@ -1288,6 +1295,13 @@ static inline unsigned int bdev_max_segments(struct block_device *bdev)
return queue_max_segments(bdev_get_queue(bdev));
}
+static inline unsigned short bdev_max_write_streams(struct block_device *bdev)
+{
+ if (bdev_is_partition(bdev))
+ return 0;
+ return bdev_limits(bdev)->max_write_streams;
+}
+
static inline unsigned queue_logical_block_size(const struct request_queue *q)
{
return q->limits.logical_block_size;
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index e7da3c3b098b..166d6de50dbf 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -785,6 +785,17 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
struct cgroup_namespace *ns);
+static inline void get_cgroup_ns(struct cgroup_namespace *ns)
+{
+ refcount_inc(&ns->ns.count);
+}
+
+static inline void put_cgroup_ns(struct cgroup_namespace *ns)
+{
+ if (refcount_dec_and_test(&ns->ns.count))
+ free_cgroup_ns(ns);
+}
+
#else /* !CONFIG_CGROUPS */
static inline void free_cgroup_ns(struct cgroup_namespace *ns) { }
@@ -795,19 +806,10 @@ copy_cgroup_ns(unsigned long flags, struct user_namespace *user_ns,
return old_ns;
}
-#endif /* !CONFIG_CGROUPS */
+static inline void get_cgroup_ns(struct cgroup_namespace *ns) { }
+static inline void put_cgroup_ns(struct cgroup_namespace *ns) { }
-static inline void get_cgroup_ns(struct cgroup_namespace *ns)
-{
- if (ns)
- refcount_inc(&ns->ns.count);
-}
-
-static inline void put_cgroup_ns(struct cgroup_namespace *ns)
-{
- if (ns && refcount_dec_and_test(&ns->ns.count))
- free_cgroup_ns(ns);
-}
+#endif /* !CONFIG_CGROUPS */
#ifdef CONFIG_CGROUPS
diff --git a/include/linux/codetag.h b/include/linux/codetag.h
index d14dbd26b370..0ee4c21c6dbc 100644
--- a/include/linux/codetag.h
+++ b/include/linux/codetag.h
@@ -36,10 +36,10 @@ union codetag_ref {
struct codetag_type_desc {
const char *section;
size_t tag_size;
- void (*module_load)(struct codetag_type *cttype,
- struct codetag_module *cmod);
- void (*module_unload)(struct codetag_type *cttype,
- struct codetag_module *cmod);
+ void (*module_load)(struct module *mod,
+ struct codetag *start, struct codetag *end);
+ void (*module_unload)(struct module *mod,
+ struct codetag *start, struct codetag *end);
#ifdef CONFIG_MODULES
void (*module_replaced)(struct module *mod, struct module *new_mod);
bool (*needs_section_mem)(struct module *mod, unsigned long size);
diff --git a/include/linux/coredump.h b/include/linux/coredump.h
index 77e6e195d1d6..76e41805b92d 100644
--- a/include/linux/coredump.h
+++ b/include/linux/coredump.h
@@ -28,6 +28,7 @@ struct coredump_params {
int vma_count;
size_t vma_data_size;
struct core_vma_metadata *vma_meta;
+ struct pid *pid;
};
extern unsigned int core_file_note_size_limit;
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index e9f07e37dd6f..e29823c701ac 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -57,7 +57,8 @@ struct qstr {
};
#define QSTR_INIT(n,l) { { { .len = l } }, .name = n }
-#define QSTR(n) (struct qstr)QSTR_INIT(n, strlen(n))
+#define QSTR_LEN(n,l) (struct qstr)QSTR_INIT(n,l)
+#define QSTR(n) QSTR_LEN(n, strlen(n))
extern const struct qstr empty_name;
extern const struct qstr slash_name;
@@ -281,7 +282,6 @@ extern void d_exchange(struct dentry *, struct dentry *);
extern struct dentry *d_ancestor(struct dentry *, struct dentry *);
extern struct dentry *d_lookup(const struct dentry *, const struct qstr *);
-extern struct dentry *d_hash_and_lookup(struct dentry *, struct qstr *);
static inline unsigned d_count(const struct dentry *dentry)
{
diff --git a/include/linux/device_cgroup.h b/include/linux/device_cgroup.h
index d02f32b7514e..0864773a57e8 100644
--- a/include/linux/device_cgroup.h
+++ b/include/linux/device_cgroup.h
@@ -18,15 +18,16 @@ static inline int devcgroup_inode_permission(struct inode *inode, int mask)
{
short type, access = 0;
+ if (likely(!S_ISBLK(inode->i_mode) && !S_ISCHR(inode->i_mode)))
+ return 0;
+
if (likely(!inode->i_rdev))
return 0;
if (S_ISBLK(inode->i_mode))
type = DEVCG_DEV_BLOCK;
- else if (S_ISCHR(inode->i_mode))
+ else /* S_ISCHR by the test above */
type = DEVCG_DEV_CHAR;
- else
- return 0;
if (mask & MAY_WRITE)
access |= DEVCG_ACC_WRITE;
diff --git a/include/linux/dmapool.h b/include/linux/dmapool.h
index f632ecfb4238..06c4de602b2f 100644
--- a/include/linux/dmapool.h
+++ b/include/linux/dmapool.h
@@ -11,6 +11,7 @@
#ifndef LINUX_DMAPOOL_H
#define LINUX_DMAPOOL_H
+#include <linux/nodemask_types.h>
#include <linux/scatterlist.h>
#include <asm/io.h>
@@ -18,8 +19,8 @@ struct device;
#ifdef CONFIG_HAS_DMA
-struct dma_pool *dma_pool_create(const char *name, struct device *dev,
- size_t size, size_t align, size_t allocation);
+struct dma_pool *dma_pool_create_node(const char *name, struct device *dev,
+ size_t size, size_t align, size_t boundary, int node);
void dma_pool_destroy(struct dma_pool *pool);
@@ -35,9 +36,12 @@ struct dma_pool *dmam_pool_create(const char *name, struct device *dev,
void dmam_pool_destroy(struct dma_pool *pool);
#else /* !CONFIG_HAS_DMA */
-static inline struct dma_pool *dma_pool_create(const char *name,
- struct device *dev, size_t size, size_t align, size_t allocation)
-{ return NULL; }
+static inline struct dma_pool *dma_pool_create_node(const char *name,
+ struct device *dev, size_t size, size_t align, size_t boundary,
+ int node)
+{
+ return NULL;
+}
static inline void dma_pool_destroy(struct dma_pool *pool) { }
static inline void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
dma_addr_t *handle) { return NULL; }
@@ -49,6 +53,13 @@ static inline struct dma_pool *dmam_pool_create(const char *name,
static inline void dmam_pool_destroy(struct dma_pool *pool) { }
#endif /* !CONFIG_HAS_DMA */
+static inline struct dma_pool *dma_pool_create(const char *name,
+ struct device *dev, size_t size, size_t align, size_t boundary)
+{
+ return dma_pool_create_node(name, dev, size, align, boundary,
+ NUMA_NO_NODE);
+}
+
static inline void *dma_pool_zalloc(struct dma_pool *pool, gfp_t mem_flags,
dma_addr_t *handle)
{
diff --git a/include/linux/file.h b/include/linux/file.h
index 302f11355b10..af1768d934a0 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -59,7 +59,7 @@ static inline struct fd CLONED_FD(struct file *f)
static inline void fdput(struct fd fd)
{
- if (fd.word & FDPUT_FPUT)
+ if (unlikely(fd.word & FDPUT_FPUT))
fput(fd_file(fd));
}
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 016b0fe1536e..b807045614c6 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -408,6 +408,7 @@ struct kiocb {
void *private;
int ki_flags;
u16 ki_ioprio; /* See linux/ioprio.h */
+ u8 ki_write_stream;
union {
/*
* Only used for async buffered reads, where it denotes the
@@ -433,7 +434,6 @@ static inline bool is_sync_kiocb(struct kiocb *kiocb)
}
struct address_space_operations {
- int (*writepage)(struct page *page, struct writeback_control *wbc);
int (*read_folio)(struct file *, struct folio *);
/* Write back some dirty pages from this mapping. */
@@ -867,6 +867,11 @@ static inline void inode_lock(struct inode *inode)
down_write(&inode->i_rwsem);
}
+static inline __must_check int inode_lock_killable(struct inode *inode)
+{
+ return down_write_killable(&inode->i_rwsem);
+}
+
static inline void inode_unlock(struct inode *inode)
{
up_write(&inode->i_rwsem);
@@ -877,6 +882,11 @@ static inline void inode_lock_shared(struct inode *inode)
down_read(&inode->i_rwsem);
}
+static inline __must_check int inode_lock_shared_killable(struct inode *inode)
+{
+ return down_read_killable(&inode->i_rwsem);
+}
+
static inline void inode_unlock_shared(struct inode *inode)
{
up_read(&inode->i_rwsem);
@@ -1307,6 +1317,7 @@ struct sb_writers {
unsigned short frozen; /* Is sb frozen? */
int freeze_kcount; /* How many kernel freeze requests? */
int freeze_ucount; /* How many userspace freeze requests? */
+ const void *freeze_owner; /* Owner of the freeze */
struct percpu_rw_semaphore rw_sem[SB_FREEZE_LEVELS];
};
@@ -1780,7 +1791,7 @@ static inline void __sb_end_write(struct super_block *sb, int level)
static inline void __sb_start_write(struct super_block *sb, int level)
{
- percpu_down_read(sb->s_writers.rw_sem + level - 1);
+ percpu_down_read_freezable(sb->s_writers.rw_sem + level - 1, true);
}
static inline bool __sb_start_write_trylock(struct super_block *sb, int level)
@@ -2071,8 +2082,18 @@ typedef bool (*filldir_t)(struct dir_context *, const char *, int, loff_t, u64,
struct dir_context {
filldir_t actor;
loff_t pos;
+ /*
+ * Filesystems MUST NOT MODIFY count, but may use as a hint:
+ * 0 unknown
+ * > 0 space in buffer (assume at least one entry)
+ * INT_MAX unlimited
+ */
+ int count;
};
+/* If OR-ed with d_type, pending signals are not checked */
+#define FILLDIR_FLAG_NOINTR 0x1000
+
/*
* These flags let !MMU mmap() govern direct device mapping vs immediate
* copying more easily for MAP_PRIVATE, especially for ROM filesystems.
@@ -2186,7 +2207,7 @@ struct file_operations {
/* Supports asynchronous lock callbacks */
#define FOP_ASYNC_LOCK ((__force fop_flags_t)(1 << 6))
/* File system supports uncached read/write buffered IO */
-#define FOP_DONTCACHE ((__force fop_flags_t)(1 << 7))
+#define FOP_DONTCACHE 0 /* ((__force fop_flags_t)(1 << 7)) */
/* Wrap a directory iterator that needs exclusive inode access */
int wrap_directory_iterator(struct file *, struct dir_context *,
@@ -2269,6 +2290,7 @@ extern loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
* @FREEZE_HOLDER_KERNEL: kernel wants to freeze or thaw filesystem
* @FREEZE_HOLDER_USERSPACE: userspace wants to freeze or thaw filesystem
* @FREEZE_MAY_NEST: whether nesting freeze and thaw requests is allowed
+ * @FREEZE_EXCL: a freeze that can only be undone by the owner
*
* Indicate who the owner of the freeze or thaw request is and whether
* the freeze needs to be exclusive or can nest.
@@ -2282,6 +2304,7 @@ enum freeze_holder {
FREEZE_HOLDER_KERNEL = (1U << 0),
FREEZE_HOLDER_USERSPACE = (1U << 1),
FREEZE_MAY_NEST = (1U << 2),
+ FREEZE_EXCL = (1U << 3),
};
struct super_operations {
@@ -2295,9 +2318,9 @@ struct super_operations {
void (*evict_inode) (struct inode *);
void (*put_super) (struct super_block *);
int (*sync_fs)(struct super_block *sb, int wait);
- int (*freeze_super) (struct super_block *, enum freeze_holder who);
+ int (*freeze_super) (struct super_block *, enum freeze_holder who, const void *owner);
int (*freeze_fs) (struct super_block *);
- int (*thaw_super) (struct super_block *, enum freeze_holder who);
+ int (*thaw_super) (struct super_block *, enum freeze_holder who, const void *owner);
int (*unfreeze_fs) (struct super_block *);
int (*statfs) (struct dentry *, struct kstatfs *);
int (*remount_fs) (struct super_block *, int *, char *);
@@ -2344,6 +2367,7 @@ struct super_operations {
#define S_CASEFOLD (1 << 15) /* Casefolded file */
#define S_VERITY (1 << 16) /* Verity file (using fs/verity/) */
#define S_KERNEL_FILE (1 << 17) /* File is in use by the kernel (eg. fs/cachefiles) */
+#define S_ANON_INODE (1 << 19) /* Inode is an anonymous inode */
/*
* Note that nosuid etc flags are inode-specific: setting some file-system
@@ -2400,6 +2424,7 @@ static inline bool sb_rdonly(const struct super_block *sb) { return sb->s_flags
#define IS_WHITEOUT(inode) (S_ISCHR(inode->i_mode) && \
(inode)->i_rdev == WHITEOUT_DEV)
+#define IS_ANON_FILE(inode) ((inode)->i_flags & S_ANON_INODE)
static inline bool HAS_UNMAPPED_ID(struct mnt_idmap *idmap,
struct inode *inode)
@@ -2705,8 +2730,10 @@ extern int unregister_filesystem(struct file_system_type *);
extern int vfs_statfs(const struct path *, struct kstatfs *);
extern int user_statfs(const char __user *, struct kstatfs *);
extern int fd_statfs(int, struct kstatfs *);
-int freeze_super(struct super_block *super, enum freeze_holder who);
-int thaw_super(struct super_block *super, enum freeze_holder who);
+int freeze_super(struct super_block *super, enum freeze_holder who,
+ const void *freeze_owner);
+int thaw_super(struct super_block *super, enum freeze_holder who,
+ const void *freeze_owner);
extern __printf(2, 3)
int super_setup_bdi_name(struct super_block *sb, char *fmt, ...);
extern int super_setup_bdi(struct super_block *sb);
@@ -3515,9 +3542,11 @@ extern void put_filesystem(struct file_system_type *fs);
extern struct file_system_type *get_fs_type(const char *name);
extern void drop_super(struct super_block *sb);
extern void drop_super_exclusive(struct super_block *sb);
-extern void iterate_supers(void (*)(struct super_block *, void *), void *);
+extern void iterate_supers(void (*f)(struct super_block *, void *), void *arg);
extern void iterate_supers_type(struct file_system_type *,
void (*)(struct super_block *, void *), void *);
+void filesystems_freeze(void);
+void filesystems_thaw(void);
extern int dcache_dir_open(struct inode *, struct file *);
extern int dcache_dir_close(struct inode *, struct file *);
diff --git a/include/linux/fs_parser.h b/include/linux/fs_parser.h
index 53e566efd5fd..5a0e897cae80 100644
--- a/include/linux/fs_parser.h
+++ b/include/linux/fs_parser.h
@@ -87,14 +87,9 @@ extern int lookup_constant(const struct constant_table tbl[], const char *name,
extern const struct constant_table bool_names[];
#ifdef CONFIG_VALIDATE_FS_PARSER
-extern bool validate_constant_table(const struct constant_table *tbl, size_t tbl_size,
- int low, int high, int special);
extern bool fs_validate_description(const char *name,
const struct fs_parameter_spec *desc);
#else
-static inline bool validate_constant_table(const struct constant_table *tbl, size_t tbl_size,
- int low, int high, int special)
-{ return true; }
static inline bool fs_validate_description(const char *name,
const struct fs_parameter_spec *desc)
{ return true; }
@@ -125,8 +120,6 @@ static inline bool fs_validate_description(const char *name,
#define fsparam_u32(NAME, OPT) __fsparam(fs_param_is_u32, NAME, OPT, 0, NULL)
#define fsparam_u32oct(NAME, OPT) \
__fsparam(fs_param_is_u32, NAME, OPT, 0, (void *)8)
-#define fsparam_u32hex(NAME, OPT) \
- __fsparam(fs_param_is_u32_hex, NAME, OPT, 0, (void *)16)
#define fsparam_s32(NAME, OPT) __fsparam(fs_param_is_s32, NAME, OPT, 0, NULL)
#define fsparam_u64(NAME, OPT) __fsparam(fs_param_is_u64, NAME, OPT, 0, NULL)
#define fsparam_enum(NAME, OPT, array) __fsparam(fs_param_is_enum, NAME, OPT, 0, array)
diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 5c6bea81a90e..c698f8415675 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -461,7 +461,7 @@ static inline void memcpy_from_folio(char *to, struct folio *folio,
const char *from = kmap_local_folio(folio, offset);
size_t chunk = len;
- if (folio_test_highmem(folio) &&
+ if (folio_test_partial_kmap(folio) &&
chunk > PAGE_SIZE - offset_in_page(offset))
chunk = PAGE_SIZE - offset_in_page(offset);
memcpy(to, from, chunk);
@@ -489,7 +489,7 @@ static inline void memcpy_to_folio(struct folio *folio, size_t offset,
char *to = kmap_local_folio(folio, offset);
size_t chunk = len;
- if (folio_test_highmem(folio) &&
+ if (folio_test_partial_kmap(folio) &&
chunk > PAGE_SIZE - offset_in_page(offset))
chunk = PAGE_SIZE - offset_in_page(offset);
memcpy(to, from, chunk);
@@ -522,7 +522,7 @@ static inline __must_check void *folio_zero_tail(struct folio *folio,
{
size_t len = folio_size(folio) - offset;
- if (folio_test_highmem(folio)) {
+ if (folio_test_partial_kmap(folio)) {
size_t max = PAGE_SIZE - offset_in_page(offset);
while (len > max) {
@@ -560,7 +560,7 @@ static inline void folio_fill_tail(struct folio *folio, size_t offset,
VM_BUG_ON(offset + len > folio_size(folio));
- if (folio_test_highmem(folio)) {
+ if (folio_test_partial_kmap(folio)) {
size_t max = PAGE_SIZE - offset_in_page(offset);
while (len > max) {
@@ -597,7 +597,7 @@ static inline size_t memcpy_from_file_folio(char *to, struct folio *folio,
size_t offset = offset_in_folio(folio, pos);
char *from = kmap_local_folio(folio, offset);
- if (folio_test_highmem(folio)) {
+ if (folio_test_partial_kmap(folio)) {
offset = offset_in_page(offset);
len = min_t(size_t, len, PAGE_SIZE - offset);
} else
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 8f3ac832ee7f..4861a7e304bb 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -275,6 +275,7 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
bool is_hugetlb_entry_migration(pte_t pte);
bool is_hugetlb_entry_hwpoisoned(pte_t pte);
void hugetlb_unshare_all_pmds(struct vm_area_struct *vma);
+void fixup_hugetlb_reservations(struct vm_area_struct *vma);
#else /* !CONFIG_HUGETLB_PAGE */
@@ -468,6 +469,10 @@ static inline vm_fault_t hugetlb_fault(struct mm_struct *mm,
static inline void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) { }
+static inline void fixup_hugetlb_reservations(struct vm_area_struct *vma)
+{
+}
+
#endif /* !CONFIG_HUGETLB_PAGE */
#ifndef pgd_write
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index d6ffe01962c2..b52ac40d5830 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -1167,13 +1167,6 @@ extern int vmbus_sendpacket(struct vmbus_channel *channel,
enum vmbus_packet_type type,
u32 flags);
-extern int vmbus_sendpacket_pagebuffer(struct vmbus_channel *channel,
- struct hv_page_buffer pagebuffers[],
- u32 pagecount,
- void *buffer,
- u32 bufferlen,
- u64 requestid);
-
extern int vmbus_sendpacket_mpb_desc(struct vmbus_channel *channel,
struct vmbus_packet_mpb_array *mpb,
u32 desc_size,
diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h
index 0634a3de1782..53408124c1e5 100644
--- a/include/linux/io_uring/cmd.h
+++ b/include/linux/io_uring/cmd.h
@@ -140,6 +140,15 @@ static inline struct io_uring_cmd_data *io_uring_cmd_get_async_data(struct io_ur
return cmd_to_io_kiocb(cmd)->async_data;
}
+/*
+ * Return uring_cmd's context reference as its context handle for driver to
+ * track per-context resource, such as registered kernel IO buffer
+ */
+static inline void *io_uring_cmd_ctx_handle(struct io_uring_cmd *cmd)
+{
+ return cmd_to_io_kiocb(cmd)->ctx;
+}
+
int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq,
void (*release)(void *), unsigned int index,
unsigned int issue_flags);
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index b44d201520d8..2922635986f5 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -40,8 +40,6 @@ enum io_uring_cmd_flags {
IO_URING_F_TASK_DEAD = (1 << 13),
};
-struct io_zcrx_ifq;
-
struct io_wq_work_node {
struct io_wq_work_node *next;
};
@@ -343,7 +341,6 @@ struct io_ring_ctx {
unsigned cached_cq_tail;
unsigned cq_entries;
struct io_ev_fd __rcu *io_ev_fd;
- unsigned cq_extra;
void *cq_wait_arg;
size_t cq_wait_size;
@@ -394,7 +391,8 @@ struct io_ring_ctx {
struct wait_queue_head poll_wq;
struct io_restriction restrictions;
- struct io_zcrx_ifq *ifq;
+ /* Stores zcrx object pointers of type struct io_zcrx_ifq */
+ struct xarray zcrx_ctxs;
u32 pers_next;
struct xarray personalities;
@@ -418,6 +416,7 @@ struct io_ring_ctx {
struct callback_head poll_wq_task_work;
struct list_head defer_list;
+ unsigned nr_drained;
struct io_alloc_cache msg_cache;
spinlock_t msg_lock;
@@ -436,6 +435,7 @@ struct io_ring_ctx {
/* protected by ->completion_lock */
unsigned evfd_last_cq_tail;
+ unsigned nr_req_allocated;
/*
* Protection for resize vs mmap races - both the mmap and resize
@@ -448,8 +448,6 @@ struct io_ring_ctx {
struct io_mapped_region ring_region;
/* used for optimised request parameter and wait argument passing */
struct io_mapped_region param_region;
- /* just one zcrx per ring for now, will move to io_zcrx_ifq eventually */
- struct io_mapped_region zcrx_region;
};
/*
@@ -653,8 +651,7 @@ struct io_kiocb {
u8 iopoll_completed;
/*
* Can be either a fixed buffer index, or used with provided buffers.
- * For the latter, before issue it points to the buffer group ID,
- * and after selection it points to the buffer ID itself.
+ * For the latter, it points to the selected buffer ID.
*/
u16 buf_index;
@@ -713,7 +710,7 @@ struct io_kiocb {
const struct cred *creds;
struct io_wq_work work;
- struct {
+ struct io_big_cqe {
u64 extra1;
u64 extra2;
} big_cqe;
diff --git a/include/linux/micrel_phy.h b/include/linux/micrel_phy.h
index 591bf5b5e8dc..9af01bdd86d2 100644
--- a/include/linux/micrel_phy.h
+++ b/include/linux/micrel_phy.h
@@ -44,7 +44,6 @@
#define MICREL_PHY_50MHZ_CLK BIT(0)
#define MICREL_PHY_FXEN BIT(1)
#define MICREL_KSZ8_P1_ERRATA BIT(2)
-#define MICREL_NO_EEE BIT(3)
#define MICREL_KSZ9021_EXTREG_CTRL 0xB
#define MICREL_KSZ9021_EXTREG_DATA_WRITE 0xC
diff --git a/include/linux/mm.h b/include/linux/mm.h
index bf55206935c4..fdda6b16263b 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -385,7 +385,7 @@ extern unsigned int kobjsize(const void *objp);
#endif
#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
-# define VM_UFFD_MINOR_BIT 38
+# define VM_UFFD_MINOR_BIT 41
# define VM_UFFD_MINOR BIT(VM_UFFD_MINOR_BIT) /* UFFD minor faults */
#else /* !CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
# define VM_UFFD_MINOR VM_NONE
diff --git a/include/linux/mman.h b/include/linux/mman.h
index bce214fece16..f4c6346a8fcd 100644
--- a/include/linux/mman.h
+++ b/include/linux/mman.h
@@ -155,7 +155,9 @@ calc_vm_flag_bits(struct file *file, unsigned long flags)
return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) |
_calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED ) |
_calc_vm_trans(flags, MAP_SYNC, VM_SYNC ) |
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
_calc_vm_trans(flags, MAP_STACK, VM_NOHUGEPAGE) |
+#endif
arch_calc_vm_flag_bits(file, flags);
}
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 6ccec1bf2896..b1c459f7a485 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -148,7 +148,6 @@ enum zone_stat_item {
NR_ZONE_WRITE_PENDING, /* Count of dirty, writeback and unstable pages */
NR_MLOCK, /* mlock()ed pages found and moved off LRU */
/* Second 128 byte cacheline */
- NR_BOUNCE,
#if IS_ENABLED(CONFIG_ZSMALLOC)
NR_ZSPAGES, /* allocated in zsmalloc */
#endif
diff --git a/include/linux/mount.h b/include/linux/mount.h
index dcc17ce8a959..6904ad33ee7a 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -22,48 +22,51 @@ struct fs_context;
struct file;
struct path;
-#define MNT_NOSUID 0x01
-#define MNT_NODEV 0x02
-#define MNT_NOEXEC 0x04
-#define MNT_NOATIME 0x08
-#define MNT_NODIRATIME 0x10
-#define MNT_RELATIME 0x20
-#define MNT_READONLY 0x40 /* does the user want this to be r/o? */
-#define MNT_NOSYMFOLLOW 0x80
-
-#define MNT_SHRINKABLE 0x100
-#define MNT_WRITE_HOLD 0x200
-
-#define MNT_SHARED 0x1000 /* if the vfsmount is a shared mount */
-#define MNT_UNBINDABLE 0x2000 /* if the vfsmount is a unbindable mount */
-/*
- * MNT_SHARED_MASK is the set of flags that should be cleared when a
- * mount becomes shared. Currently, this is only the flag that says a
- * mount cannot be bind mounted, since this is how we create a mount
- * that shares events with another mount. If you add a new MNT_*
- * flag, consider how it interacts with shared mounts.
- */
-#define MNT_SHARED_MASK (MNT_UNBINDABLE)
-#define MNT_USER_SETTABLE_MASK (MNT_NOSUID | MNT_NODEV | MNT_NOEXEC \
- | MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME \
- | MNT_READONLY | MNT_NOSYMFOLLOW)
-#define MNT_ATIME_MASK (MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME )
-
-#define MNT_INTERNAL_FLAGS (MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | \
- MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED)
-
-#define MNT_INTERNAL 0x4000
-
-#define MNT_LOCK_ATIME 0x040000
-#define MNT_LOCK_NOEXEC 0x080000
-#define MNT_LOCK_NOSUID 0x100000
-#define MNT_LOCK_NODEV 0x200000
-#define MNT_LOCK_READONLY 0x400000
-#define MNT_LOCKED 0x800000
-#define MNT_DOOMED 0x1000000
-#define MNT_SYNC_UMOUNT 0x2000000
-#define MNT_MARKED 0x4000000
-#define MNT_UMOUNT 0x8000000
+enum mount_flags {
+ MNT_NOSUID = 0x01,
+ MNT_NODEV = 0x02,
+ MNT_NOEXEC = 0x04,
+ MNT_NOATIME = 0x08,
+ MNT_NODIRATIME = 0x10,
+ MNT_RELATIME = 0x20,
+ MNT_READONLY = 0x40, /* does the user want this to be r/o? */
+ MNT_NOSYMFOLLOW = 0x80,
+
+ MNT_SHRINKABLE = 0x100,
+ MNT_WRITE_HOLD = 0x200,
+
+ MNT_SHARED = 0x1000, /* if the vfsmount is a shared mount */
+ MNT_UNBINDABLE = 0x2000, /* if the vfsmount is a unbindable mount */
+
+ MNT_INTERNAL = 0x4000,
+
+ MNT_LOCK_ATIME = 0x040000,
+ MNT_LOCK_NOEXEC = 0x080000,
+ MNT_LOCK_NOSUID = 0x100000,
+ MNT_LOCK_NODEV = 0x200000,
+ MNT_LOCK_READONLY = 0x400000,
+ MNT_LOCKED = 0x800000,
+ MNT_DOOMED = 0x1000000,
+ MNT_SYNC_UMOUNT = 0x2000000,
+ MNT_MARKED = 0x4000000,
+ MNT_UMOUNT = 0x8000000,
+
+ /*
+ * MNT_SHARED_MASK is the set of flags that should be cleared when a
+ * mount becomes shared. Currently, this is only the flag that says a
+ * mount cannot be bind mounted, since this is how we create a mount
+ * that shares events with another mount. If you add a new MNT_*
+ * flag, consider how it interacts with shared mounts.
+ */
+ MNT_SHARED_MASK = MNT_UNBINDABLE,
+ MNT_USER_SETTABLE_MASK = MNT_NOSUID | MNT_NODEV | MNT_NOEXEC
+ | MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME
+ | MNT_READONLY | MNT_NOSYMFOLLOW,
+ MNT_ATIME_MASK = MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME,
+
+ MNT_INTERNAL_FLAGS = MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL |
+ MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED,
+};
struct vfsmount {
struct dentry *mnt_root; /* root of the mounted tree */
diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h
index 58a2401e4b55..0075f6e5c3da 100644
--- a/include/linux/mroute_base.h
+++ b/include/linux/mroute_base.h
@@ -262,6 +262,11 @@ struct mr_table {
int mroute_reg_vif_num;
};
+static inline bool mr_can_free_table(struct net *net)
+{
+ return !check_net(net) || !net_initialized(net);
+}
+
#ifdef CONFIG_IP_MROUTE_COMMON
void vif_device_init(struct vif_device *v,
struct net_device *dev,
diff --git a/include/linux/namei.h b/include/linux/namei.h
index bbaf55fb3101..5d085428e471 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -70,17 +70,16 @@ int vfs_path_parent_lookup(struct filename *filename, unsigned int flags,
int vfs_path_lookup(struct dentry *, struct vfsmount *, const char *,
unsigned int, struct path *);
-extern struct dentry *try_lookup_one_len(const char *, struct dentry *, int);
-extern struct dentry *lookup_one_len(const char *, struct dentry *, int);
-extern struct dentry *lookup_one_len_unlocked(const char *, struct dentry *, int);
-extern struct dentry *lookup_positive_unlocked(const char *, struct dentry *, int);
-struct dentry *lookup_one(struct mnt_idmap *, const char *, struct dentry *, int);
+extern struct dentry *try_lookup_noperm(struct qstr *, struct dentry *);
+extern struct dentry *lookup_noperm(struct qstr *, struct dentry *);
+extern struct dentry *lookup_noperm_unlocked(struct qstr *, struct dentry *);
+extern struct dentry *lookup_noperm_positive_unlocked(struct qstr *, struct dentry *);
+struct dentry *lookup_one(struct mnt_idmap *, struct qstr *, struct dentry *);
struct dentry *lookup_one_unlocked(struct mnt_idmap *idmap,
- const char *name, struct dentry *base,
- int len);
+ struct qstr *name, struct dentry *base);
struct dentry *lookup_one_positive_unlocked(struct mnt_idmap *idmap,
- const char *name,
- struct dentry *base, int len);
+ struct qstr *name,
+ struct dentry *base);
extern int follow_down_one(struct path *);
extern int follow_down(struct path *path, unsigned int flags);
diff --git a/include/linux/net.h b/include/linux/net.h
index 0ff950eecc6b..f60fff91e1df 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -70,6 +70,7 @@ enum sock_type {
SOCK_DCCP = 6,
SOCK_PACKET = 10,
};
+#endif /* ARCH_HAS_SOCKET_TYPES */
#define SOCK_MAX (SOCK_PACKET + 1)
/* Mask which covers at least up to SOCK_MASK-1. The
@@ -81,8 +82,7 @@ enum sock_type {
#ifndef SOCK_NONBLOCK
#define SOCK_NONBLOCK O_NONBLOCK
#endif
-
-#endif /* ARCH_HAS_SOCKET_TYPES */
+#define SOCK_COREDUMP O_NOCTTY
/**
* enum sock_shutdown_cmd - Shutdown types
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 71319637a84e..ee03f3cef30c 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -213,6 +213,15 @@ struct nfs_server {
char *fscache_uniq; /* Uniquifier (or NULL) */
#endif
+ /* The following #defines numerically match the NFSv4 equivalents */
+#define NFS_FH_NOEXPIRE_WITH_OPEN (0x1)
+#define NFS_FH_VOLATILE_ANY (0x2)
+#define NFS_FH_VOL_MIGRATION (0x4)
+#define NFS_FH_VOL_RENAME (0x8)
+#define NFS_FH_RENAME_UNSAFE (NFS_FH_VOLATILE_ANY | NFS_FH_VOL_RENAME)
+ u32 fh_expire_type; /* V4 bitmask representing file
+ handle volatility type for
+ this filesystem */
u32 pnfs_blksize; /* layout_blksize attr */
#if IS_ENABLED(CONFIG_NFS_V4)
u32 attr_bitmask[3];/* V4 bitmask representing the set
@@ -236,9 +245,6 @@ struct nfs_server {
u32 acl_bitmask; /* V4 bitmask representing the ACEs
that are supported on this
filesystem */
- u32 fh_expire_type; /* V4 bitmask representing file
- handle volatility type for
- this filesystem */
struct pnfs_layoutdriver_type *pnfs_curr_ld; /* Active layout driver */
struct rpc_wait_queue roc_rpcwaitq;
void *pnfs_ld_data; /* per mount point data */
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 2479ed10f53e..51308f65b72f 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -303,6 +303,7 @@ enum nvme_ctrl_attr {
NVME_CTRL_ATTR_TBKAS = (1 << 6),
NVME_CTRL_ATTR_ELBAS = (1 << 15),
NVME_CTRL_ATTR_RHII = (1 << 18),
+ NVME_CTRL_ATTR_FDPS = (1 << 19),
};
struct nvme_id_ctrl {
@@ -689,6 +690,44 @@ struct nvme_rotational_media_log {
__u8 rsvd24[488];
};
+struct nvme_fdp_config {
+ __u8 flags;
+#define FDPCFG_FDPE (1U << 0)
+ __u8 fdpcidx;
+ __le16 reserved;
+};
+
+struct nvme_fdp_ruh_desc {
+ __u8 ruht;
+ __u8 reserved[3];
+};
+
+struct nvme_fdp_config_desc {
+ __le16 dsze;
+ __u8 fdpa;
+ __u8 vss;
+ __le32 nrg;
+ __le16 nruh;
+ __le16 maxpids;
+ __le32 nns;
+ __le64 runs;
+ __le32 erutl;
+ __u8 rsvd28[36];
+ struct nvme_fdp_ruh_desc ruhs[];
+};
+
+struct nvme_fdp_config_log {
+ __le16 numfdpc;
+ __u8 ver;
+ __u8 rsvd3;
+ __le32 sze;
+ __u8 rsvd8[8];
+ /*
+ * This is followed by variable number of nvme_fdp_config_desc
+ * structures, but sparse doesn't like nested variable sized arrays.
+ */
+};
+
struct nvme_smart_log {
__u8 critical_warning;
__u8 temperature[2];
@@ -915,6 +954,7 @@ enum nvme_opcode {
nvme_cmd_resv_register = 0x0d,
nvme_cmd_resv_report = 0x0e,
nvme_cmd_resv_acquire = 0x11,
+ nvme_cmd_io_mgmt_recv = 0x12,
nvme_cmd_resv_release = 0x15,
nvme_cmd_zone_mgmt_send = 0x79,
nvme_cmd_zone_mgmt_recv = 0x7a,
@@ -936,6 +976,7 @@ enum nvme_opcode {
nvme_opcode_name(nvme_cmd_resv_register), \
nvme_opcode_name(nvme_cmd_resv_report), \
nvme_opcode_name(nvme_cmd_resv_acquire), \
+ nvme_opcode_name(nvme_cmd_io_mgmt_recv), \
nvme_opcode_name(nvme_cmd_resv_release), \
nvme_opcode_name(nvme_cmd_zone_mgmt_send), \
nvme_opcode_name(nvme_cmd_zone_mgmt_recv), \
@@ -1087,6 +1128,7 @@ enum {
NVME_RW_PRINFO_PRCHK_GUARD = 1 << 12,
NVME_RW_PRINFO_PRACT = 1 << 13,
NVME_RW_DTYPE_STREAMS = 1 << 4,
+ NVME_RW_DTYPE_DPLCMT = 2 << 4,
NVME_WZ_DEAC = 1 << 9,
};
@@ -1174,6 +1216,38 @@ struct nvme_zone_mgmt_recv_cmd {
__le32 cdw14[2];
};
+struct nvme_io_mgmt_recv_cmd {
+ __u8 opcode;
+ __u8 flags;
+ __u16 command_id;
+ __le32 nsid;
+ __le64 rsvd2[2];
+ union nvme_data_ptr dptr;
+ __u8 mo;
+ __u8 rsvd11;
+ __u16 mos;
+ __le32 numd;
+ __le32 cdw12[4];
+};
+
+enum {
+ NVME_IO_MGMT_RECV_MO_RUHS = 1,
+};
+
+struct nvme_fdp_ruh_status_desc {
+ __le16 pid;
+ __le16 ruhid;
+ __le32 earutr;
+ __le64 ruamw;
+ __u8 reserved[16];
+};
+
+struct nvme_fdp_ruh_status {
+ __u8 rsvd0[14];
+ __le16 nruhsd;
+ struct nvme_fdp_ruh_status_desc ruhsd[];
+};
+
enum {
NVME_ZRA_ZONE_REPORT = 0,
NVME_ZRASF_ZONE_REPORT_ALL = 0,
@@ -1309,6 +1383,7 @@ enum {
NVME_FEAT_PLM_WINDOW = 0x14,
NVME_FEAT_HOST_BEHAVIOR = 0x16,
NVME_FEAT_SANITIZE = 0x17,
+ NVME_FEAT_FDP = 0x1d,
NVME_FEAT_SW_PROGRESS = 0x80,
NVME_FEAT_HOST_ID = 0x81,
NVME_FEAT_RESV_MASK = 0x82,
@@ -1329,6 +1404,7 @@ enum {
NVME_LOG_ANA = 0x0c,
NVME_LOG_FEATURES = 0x12,
NVME_LOG_RMI = 0x16,
+ NVME_LOG_FDP_CONFIGS = 0x20,
NVME_LOG_DISC = 0x70,
NVME_LOG_RESERVATION = 0x80,
NVME_FWACT_REPL = (0 << 3),
@@ -1923,6 +1999,7 @@ struct nvme_command {
struct nvmf_auth_receive_command auth_receive;
struct nvme_dbbuf dbbuf;
struct nvme_directive_cmd directive;
+ struct nvme_io_mgmt_recv_cmd imr;
};
};
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index e6a21b62dcce..3b814ce08331 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -615,6 +615,13 @@ FOLIO_FLAG(dropbehind, FOLIO_HEAD_PAGE)
PAGEFLAG_FALSE(HighMem, highmem)
#endif
+/* Does kmap_local_folio() only allow access to one page of the folio? */
+#ifdef CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP
+#define folio_test_partial_kmap(f) true
+#else
+#define folio_test_partial_kmap(f) folio_test_highmem(f)
+#endif
+
#ifdef CONFIG_SWAP
static __always_inline bool folio_test_swapcache(const struct folio *folio)
{
diff --git a/include/linux/part_stat.h b/include/linux/part_stat.h
index c5e9cac0575e..eeeff2a04529 100644
--- a/include/linux/part_stat.h
+++ b/include/linux/part_stat.h
@@ -79,4 +79,6 @@ static inline void part_stat_set_all(struct block_device *part, int value)
#define part_stat_local_read_cpu(part, field, cpu) \
local_read(&(part_stat_get_cpu(part, field, cpu)))
+unsigned int bdev_count_inflight(struct block_device *part);
+
#endif /* _LINUX_PART_STAT_H */
diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h
index af7d75ede619..288f5235649a 100644
--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -43,9 +43,10 @@ is_static struct percpu_rw_semaphore name = { \
#define DEFINE_STATIC_PERCPU_RWSEM(name) \
__DEFINE_PERCPU_RWSEM(name, static)
-extern bool __percpu_down_read(struct percpu_rw_semaphore *, bool);
+extern bool __percpu_down_read(struct percpu_rw_semaphore *, bool, bool);
-static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
+static inline void percpu_down_read_internal(struct percpu_rw_semaphore *sem,
+ bool freezable)
{
might_sleep();
@@ -63,7 +64,7 @@ static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
if (likely(rcu_sync_is_idle(&sem->rss)))
this_cpu_inc(*sem->read_count);
else
- __percpu_down_read(sem, false); /* Unconditional memory barrier */
+ __percpu_down_read(sem, false, freezable); /* Unconditional memory barrier */
/*
* The preempt_enable() prevents the compiler from
* bleeding the critical section out.
@@ -71,6 +72,17 @@ static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
preempt_enable();
}
+static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
+{
+ percpu_down_read_internal(sem, false);
+}
+
+static inline void percpu_down_read_freezable(struct percpu_rw_semaphore *sem,
+ bool freeze)
+{
+ percpu_down_read_internal(sem, freeze);
+}
+
static inline bool percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
{
bool ret = true;
@@ -82,7 +94,7 @@ static inline bool percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
if (likely(rcu_sync_is_idle(&sem->rss)))
this_cpu_inc(*sem->read_count);
else
- ret = __percpu_down_read(sem, true); /* Unconditional memory barrier */
+ ret = __percpu_down_read(sem, true, false); /* Unconditional memory barrier */
preempt_enable();
/*
* The barrier() from preempt_enable() prevents the compiler from
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 52b5ea663b9f..85bf8dd9f087 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -15,11 +15,7 @@
/* enough to cover all DEFINE_PER_CPUs in modules */
#ifdef CONFIG_MODULES
-#ifdef CONFIG_MEM_ALLOC_PROFILING
-#define PERCPU_MODULE_RESERVE (8 << 13)
-#else
#define PERCPU_MODULE_RESERVE (8 << 10)
-#endif
#else
#define PERCPU_MODULE_RESERVE 0
#endif
diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h
index c74077977830..8a7f4f802c57 100644
--- a/include/linux/pgalloc_tag.h
+++ b/include/linux/pgalloc_tag.h
@@ -188,6 +188,13 @@ static inline struct alloc_tag *__pgalloc_tag_get(struct page *page)
return tag;
}
+static inline struct alloc_tag *pgalloc_tag_get(struct page *page)
+{
+ if (mem_alloc_profiling_enabled())
+ return __pgalloc_tag_get(page);
+ return NULL;
+}
+
void pgalloc_tag_split(struct folio *folio, int old_order, int new_order);
void pgalloc_tag_swap(struct folio *new, struct folio *old);
@@ -199,6 +206,7 @@ static inline void clear_page_tag_ref(struct page *page) {}
static inline void alloc_tag_sec_init(void) {}
static inline void pgalloc_tag_split(struct folio *folio, int old_order, int new_order) {}
static inline void pgalloc_tag_swap(struct folio *new, struct folio *old) {}
+static inline struct alloc_tag *pgalloc_tag_get(struct page *page) { return NULL; }
#endif /* CONFIG_MEM_ALLOC_PROFILING */
diff --git a/include/linux/pid.h b/include/linux/pid.h
index 311ecebd7d56..453ae6d8a68d 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -77,7 +77,7 @@ struct file;
struct pid *pidfd_pid(const struct file *file);
struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags);
struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags);
-int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret);
+int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret_file);
void do_notify_pidfd(struct task_struct *task);
static inline struct pid *get_pid(struct pid *pid)
diff --git a/include/linux/pidfs.h b/include/linux/pidfs.h
index 05e6f8f4a026..77e7db194914 100644
--- a/include/linux/pidfs.h
+++ b/include/linux/pidfs.h
@@ -2,11 +2,19 @@
#ifndef _LINUX_PID_FS_H
#define _LINUX_PID_FS_H
+struct coredump_params;
+
struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags);
void __init pidfs_init(void);
void pidfs_add_pid(struct pid *pid);
void pidfs_remove_pid(struct pid *pid);
void pidfs_exit(struct task_struct *tsk);
+#ifdef CONFIG_COREDUMP
+void pidfs_coredump(const struct coredump_params *cprm);
+#endif
extern const struct dentry_operations pidfs_dentry_operations;
+int pidfs_register_pid(struct pid *pid);
+void pidfs_get_pid(struct pid *pid);
+void pidfs_put_pid(struct pid *pid);
#endif /* _LINUX_PID_FS_H */
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index 0b273a7b9f01..5f03a39a26f7 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -104,10 +104,11 @@ static inline bool shmem_mapping(struct address_space *mapping)
return false;
}
#endif /* CONFIG_SHMEM */
-extern void shmem_unlock_mapping(struct address_space *mapping);
-extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
+void shmem_unlock_mapping(struct address_space *mapping);
+struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
pgoff_t index, gfp_t gfp_mask);
-extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end);
+int shmem_writeout(struct folio *folio, struct writeback_control *wbc);
+void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end);
int shmem_unuse(unsigned int type);
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
diff --git a/include/linux/soundwire/sdw_intel.h b/include/linux/soundwire/sdw_intel.h
index 493d9de4e472..dc6ebaee3d18 100644
--- a/include/linux/soundwire/sdw_intel.h
+++ b/include/linux/soundwire/sdw_intel.h
@@ -365,7 +365,7 @@ struct sdw_intel_res {
* on e.g. which machine driver to select (I2S mode, HDaudio or
* SoundWire).
*/
-int sdw_intel_acpi_scan(acpi_handle *parent_handle,
+int sdw_intel_acpi_scan(acpi_handle parent_handle,
struct sdw_intel_acpi_info *info);
void sdw_intel_process_wakeen_event(struct sdw_intel_ctx *ctx);
diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index 0ba5e49bace4..6e64f0193777 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -249,10 +249,7 @@ struct spi_device {
static_assert((SPI_MODE_KERNEL_MASK & SPI_MODE_USER_MASK) == 0,
"SPI_MODE_USER_MASK & SPI_MODE_KERNEL_MASK must not overlap");
-static inline struct spi_device *to_spi_device(const struct device *dev)
-{
- return dev ? container_of(dev, struct spi_device, dev) : NULL;
-}
+#define to_spi_device(__dev) container_of_const(__dev, struct spi_device, dev)
/* Most drivers won't need to care about device refcounting */
static inline struct spi_device *spi_dev_get(struct spi_device *spi)
diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index 522d837a23fa..54bfeeaa0995 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -1798,6 +1798,7 @@ struct hci_conn_params *hci_pend_le_action_lookup(struct list_head *list,
void hci_uuids_clear(struct hci_dev *hdev);
void hci_link_keys_clear(struct hci_dev *hdev);
+u8 *hci_conn_key_enc_size(struct hci_conn *conn);
struct link_key *hci_find_link_key(struct hci_dev *hdev, bdaddr_t *bdaddr);
struct link_key *hci_add_link_key(struct hci_dev *hdev, struct hci_conn *conn,
bdaddr_t *bdaddr, u8 *val, u8 type,
diff --git a/include/net/netdev_lock.h b/include/net/netdev_lock.h
index c316b551df8d..0ee5bc766810 100644
--- a/include/net/netdev_lock.h
+++ b/include/net/netdev_lock.h
@@ -98,6 +98,9 @@ static inline int netdev_lock_cmp_fn(const struct lockdep_map *a,
&qdisc_xmit_lock_key); \
}
+#define netdev_lock_dereference(p, dev) \
+ rcu_dereference_protected(p, lockdep_is_held(&(dev)->lock))
+
int netdev_debug_event(struct notifier_block *nb, unsigned long event,
void *ptr);
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index d48c657191cd..1c05fed05f2b 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -1031,6 +1031,21 @@ static inline struct sk_buff *__qdisc_dequeue_head(struct qdisc_skb_head *qh)
return skb;
}
+static inline struct sk_buff *qdisc_dequeue_internal(struct Qdisc *sch, bool direct)
+{
+ struct sk_buff *skb;
+
+ skb = __skb_dequeue(&sch->gso_skb);
+ if (skb) {
+ sch->q.qlen--;
+ return skb;
+ }
+ if (direct)
+ return __qdisc_dequeue_head(&sch->q);
+ else
+ return sch->dequeue(sch);
+}
+
static inline struct sk_buff *qdisc_dequeue_head(struct Qdisc *sch)
{
struct sk_buff *skb = __qdisc_dequeue_head(&sch->q);
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 39365fd2ea17..06ab2a3d2ebd 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -236,7 +236,6 @@ struct xfrm_state {
/* Data for encapsulator */
struct xfrm_encap_tmpl *encap;
- struct sock __rcu *encap_sk;
/* NAT keepalive */
u32 nat_keepalive_interval; /* seconds */
diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h
index 26bc23419cfd..c53812b9026f 100644
--- a/include/scsi/scsi_host.h
+++ b/include/scsi/scsi_host.h
@@ -670,8 +670,6 @@ struct Scsi_Host {
/* The transport requires the LUN bits NOT to be stored in CDB[1] */
unsigned no_scsi2_lun_in_cdb:1;
- unsigned no_highmem:1;
-
/*
* Optional work queue to be utilized by the transport
*/
diff --git a/include/sound/pcm.h b/include/sound/pcm.h
index 8becb4504887..8582d22f3818 100644
--- a/include/sound/pcm.h
+++ b/include/sound/pcm.h
@@ -1404,6 +1404,8 @@ int snd_pcm_lib_mmap_iomem(struct snd_pcm_substream *substream, struct vm_area_s
#define snd_pcm_lib_mmap_iomem NULL
#endif
+void snd_pcm_runtime_buffer_set_silence(struct snd_pcm_runtime *runtime);
+
/**
* snd_pcm_limit_isa_dma_size - Get the max size fitting with ISA DMA transfer
* @dma: DMA number
diff --git a/include/sound/ump_msg.h b/include/sound/ump_msg.h
index 72f60ddfea75..9556b4755a1e 100644
--- a/include/sound/ump_msg.h
+++ b/include/sound/ump_msg.h
@@ -604,7 +604,7 @@ struct snd_ump_stream_msg_ep_info {
} __packed;
/* UMP Stream Message: Device Info Notification (128bit) */
-struct snd_ump_stream_msg_devince_info {
+struct snd_ump_stream_msg_device_info {
#ifdef __BIG_ENDIAN_BITFIELD
/* 0 */
u32 type:4;
@@ -754,7 +754,7 @@ struct snd_ump_stream_msg_fb_name {
union snd_ump_stream_msg {
struct snd_ump_stream_msg_ep_discovery ep_discovery;
struct snd_ump_stream_msg_ep_info ep_info;
- struct snd_ump_stream_msg_devince_info device_info;
+ struct snd_ump_stream_msg_device_info device_info;
struct snd_ump_stream_msg_stream_cfg stream_cfg;
struct snd_ump_stream_msg_fb_discovery fb_discovery;
struct snd_ump_stream_msg_fb_info fb_info;
diff --git a/include/trace/events/block.h b/include/trace/events/block.h
index bd0ea07338eb..14a924c0e303 100644
--- a/include/trace/events/block.h
+++ b/include/trace/events/block.h
@@ -11,7 +11,7 @@
#include <linux/tracepoint.h>
#include <uapi/linux/ioprio.h>
-#define RWBS_LEN 8
+#define RWBS_LEN 9
#define IOPRIO_CLASS_STRINGS \
{ IOPRIO_CLASS_NONE, "none" }, \
@@ -361,21 +361,6 @@ DECLARE_EVENT_CLASS(block_bio,
);
/**
- * block_bio_bounce - used bounce buffer when processing block operation
- * @bio: block operation
- *
- * A bounce buffer was used to handle the block operation @bio in @q.
- * This occurs when hardware limitations prevent a direct transfer of
- * data between the @bio data memory area and the IO device. Use of a
- * bounce buffer requires extra copying of data and decreases
- * performance.
- */
-DEFINE_EVENT(block_bio, block_bio_bounce,
- TP_PROTO(struct bio *bio),
- TP_ARGS(bio)
-);
-
-/**
* block_bio_backmerge - merging block operation to the end of an existing operation
* @bio: new block operation to merge
*
diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h
index fb81c533b310..178ab6f611be 100644
--- a/include/trace/events/io_uring.h
+++ b/include/trace/events/io_uring.h
@@ -645,7 +645,7 @@ TRACE_EVENT(io_uring_short_write,
/*
* io_uring_local_work_run - ran ring local task work
*
- * @tctx: pointer to a io_uring_ctx
+ * @ctx: pointer to an io_ring_ctx
* @count: how many functions it ran
* @loops: how many loops it ran
*
diff --git a/include/uapi/linux/blktrace_api.h b/include/uapi/linux/blktrace_api.h
index 690621b610e5..1bfb635e309b 100644
--- a/include/uapi/linux/blktrace_api.h
+++ b/include/uapi/linux/blktrace_api.h
@@ -49,7 +49,7 @@ enum blktrace_act {
__BLK_TA_UNPLUG_TIMER, /* queue was unplugged by timer */
__BLK_TA_INSERT, /* insert request */
__BLK_TA_SPLIT, /* bio was split */
- __BLK_TA_BOUNCE, /* bio was bounced */
+ __BLK_TA_BOUNCE, /* unused, was: bio was bounced */
__BLK_TA_REMAP, /* bio was remapped */
__BLK_TA_ABORT, /* request aborted */
__BLK_TA_DRV_DATA, /* driver-specific binary data */
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 8f1fc12bac46..cfd17e382082 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -73,6 +73,7 @@ struct io_uring_sqe {
__u32 futex_flags;
__u32 install_fd_flags;
__u32 nop_flags;
+ __u32 pipe_flags;
};
__u64 user_data; /* data to be passed back at completion time */
/* pack this to avoid bogus arm OABI complaints */
@@ -93,6 +94,10 @@ struct io_uring_sqe {
__u16 addr_len;
__u16 __pad3[1];
};
+ struct {
+ __u8 write_stream;
+ __u8 __pad4[3];
+ };
};
union {
struct {
@@ -283,6 +288,7 @@ enum io_uring_op {
IORING_OP_EPOLL_WAIT,
IORING_OP_READV_FIXED,
IORING_OP_WRITEV_FIXED,
+ IORING_OP_PIPE,
/* this goes last, obviously */
IORING_OP_LAST,
@@ -988,12 +994,16 @@ struct io_uring_zcrx_offsets {
__u64 __resv[2];
};
+enum io_uring_zcrx_area_flags {
+ IORING_ZCRX_AREA_DMABUF = 1,
+};
+
struct io_uring_zcrx_area_reg {
__u64 addr;
__u64 len;
__u64 rq_area_token;
__u32 flags;
- __u32 __resv1;
+ __u32 dmabuf_fd;
__u64 __resv2[2];
};
diff --git a/include/uapi/linux/pidfd.h b/include/uapi/linux/pidfd.h
index 2970ef44655a..c27a4e238e4b 100644
--- a/include/uapi/linux/pidfd.h
+++ b/include/uapi/linux/pidfd.h
@@ -12,7 +12,7 @@
#define PIDFD_THREAD O_EXCL
#ifdef __KERNEL__
#include <linux/sched.h>
-#define PIDFD_CLONE CLONE_PIDFD
+#define PIDFD_STALE CLONE_PIDFD
#endif
/* Flags for pidfd_send_signal(). */
@@ -25,10 +25,24 @@
#define PIDFD_INFO_CREDS (1UL << 1) /* Always returned, even if not requested */
#define PIDFD_INFO_CGROUPID (1UL << 2) /* Always returned if available, even if not requested */
#define PIDFD_INFO_EXIT (1UL << 3) /* Only returned if requested. */
+#define PIDFD_INFO_COREDUMP (1UL << 4) /* Only returned if requested. */
#define PIDFD_INFO_SIZE_VER0 64 /* sizeof first published struct */
/*
+ * Values for @coredump_mask in pidfd_info.
+ * Only valid if PIDFD_INFO_COREDUMP is set in @mask.
+ *
+ * Note, the @PIDFD_COREDUMP_ROOT flag indicates that the generated
+ * coredump should be treated as sensitive and access should only be
+ * granted to privileged users.
+ */
+#define PIDFD_COREDUMPED (1U << 0) /* Did crash and... */
+#define PIDFD_COREDUMP_SKIP (1U << 1) /* coredumping generation was skipped. */
+#define PIDFD_COREDUMP_USER (1U << 2) /* coredump was done as the user. */
+#define PIDFD_COREDUMP_ROOT (1U << 3) /* coredump was done as root. */
+
+/*
* The concept of process and threads in userland and the kernel is a confusing
* one - within the kernel every thread is a 'task' with its own individual PID,
* however from userland's point of view threads are grouped by a single PID,
@@ -92,6 +106,8 @@ struct pidfd_info {
__u32 fsuid;
__u32 fsgid;
__s32 exit_code;
+ __u32 coredump_mask;
+ __u32 __spare1;
};
#define PIDFS_IOCTL_MAGIC 0xFF
diff --git a/include/uapi/linux/taskstats.h b/include/uapi/linux/taskstats.h
index 95762232e018..5929030d4e8b 100644
--- a/include/uapi/linux/taskstats.h
+++ b/include/uapi/linux/taskstats.h
@@ -34,7 +34,7 @@
*/
-#define TASKSTATS_VERSION 15
+#define TASKSTATS_VERSION 16
#define TS_COMM_LEN 32 /* should be >= TASK_COMM_LEN
* in linux/sched.h */
@@ -72,8 +72,6 @@ struct taskstats {
*/
__u64 cpu_count __attribute__((aligned(8)));
__u64 cpu_delay_total;
- __u64 cpu_delay_max;
- __u64 cpu_delay_min;
/* Following four fields atomically updated using task->delays->lock */
@@ -82,14 +80,10 @@ struct taskstats {
*/
__u64 blkio_count;
__u64 blkio_delay_total;
- __u64 blkio_delay_max;
- __u64 blkio_delay_min;
/* Delay waiting for page fault I/O (swap in only) */
__u64 swapin_count;
__u64 swapin_delay_total;
- __u64 swapin_delay_max;
- __u64 swapin_delay_min;
/* cpu "wall-clock" running time
* On some architectures, value will adjust for cpu time stolen
@@ -172,14 +166,11 @@ struct taskstats {
/* Delay waiting for memory reclaim */
__u64 freepages_count;
__u64 freepages_delay_total;
- __u64 freepages_delay_max;
- __u64 freepages_delay_min;
+
/* Delay waiting for thrashing page */
__u64 thrashing_count;
__u64 thrashing_delay_total;
- __u64 thrashing_delay_max;
- __u64 thrashing_delay_min;
/* v10: 64-bit btime to avoid overflow */
__u64 ac_btime64; /* 64-bit begin time */
@@ -187,8 +178,6 @@ struct taskstats {
/* v11: Delay waiting for memory compact */
__u64 compact_count;
__u64 compact_delay_total;
- __u64 compact_delay_max;
- __u64 compact_delay_min;
/* v12 begin */
__u32 ac_tgid; /* thread group ID */
@@ -210,15 +199,37 @@ struct taskstats {
/* v13: Delay waiting for write-protect copy */
__u64 wpcopy_count;
__u64 wpcopy_delay_total;
- __u64 wpcopy_delay_max;
- __u64 wpcopy_delay_min;
/* v14: Delay waiting for IRQ/SOFTIRQ */
__u64 irq_count;
__u64 irq_delay_total;
- __u64 irq_delay_max;
- __u64 irq_delay_min;
- /* v15: add Delay max */
+
+ /* v15: add Delay max and Delay min */
+
+ /* v16: move Delay max and Delay min to the end of taskstat */
+ __u64 cpu_delay_max;
+ __u64 cpu_delay_min;
+
+ __u64 blkio_delay_max;
+ __u64 blkio_delay_min;
+
+ __u64 swapin_delay_max;
+ __u64 swapin_delay_min;
+
+ __u64 freepages_delay_max;
+ __u64 freepages_delay_min;
+
+ __u64 thrashing_delay_max;
+ __u64 thrashing_delay_min;
+
+ __u64 compact_delay_max;
+ __u64 compact_delay_min;
+
+ __u64 wpcopy_delay_max;
+ __u64 wpcopy_delay_min;
+
+ __u64 irq_delay_max;
+ __u64 irq_delay_min;
};
diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
index 583b86681c93..56c7e3fc666f 100644
--- a/include/uapi/linux/ublk_cmd.h
+++ b/include/uapi/linux/ublk_cmd.h
@@ -51,6 +51,10 @@
_IOR('u', 0x13, struct ublksrv_ctrl_cmd)
#define UBLK_U_CMD_DEL_DEV_ASYNC \
_IOR('u', 0x14, struct ublksrv_ctrl_cmd)
+#define UBLK_U_CMD_UPDATE_SIZE \
+ _IOWR('u', 0x15, struct ublksrv_ctrl_cmd)
+#define UBLK_U_CMD_QUIESCE_DEV \
+ _IOWR('u', 0x16, struct ublksrv_ctrl_cmd)
/*
* 64bits are enough now, and it should be easy to extend in case of
@@ -211,6 +215,63 @@
*/
#define UBLK_F_USER_RECOVERY_FAIL_IO (1ULL << 9)
+/*
+ * Resizing a block device is possible with UBLK_U_CMD_UPDATE_SIZE
+ * New size is passed in cmd->data[0] and is in units of sectors
+ */
+#define UBLK_F_UPDATE_SIZE (1ULL << 10)
+
+/*
+ * request buffer is registered automatically to uring_cmd's io_uring
+ * context before delivering this io command to ublk server, meantime
+ * it is un-registered automatically when completing this io command.
+ *
+ * For using this feature:
+ *
+ * - ublk server has to create sparse buffer table on the same `io_ring_ctx`
+ * for issuing `UBLK_IO_FETCH_REQ` and `UBLK_IO_COMMIT_AND_FETCH_REQ`.
+ * If uring_cmd isn't issued on same `io_ring_ctx`, it is ublk server's
+ * responsibility to unregister the buffer by issuing `IO_UNREGISTER_IO_BUF`
+ * manually, otherwise this ublk request won't complete.
+ *
+ * - ublk server passes auto buf register data via uring_cmd's sqe->addr,
+ * `struct ublk_auto_buf_reg` is populated from sqe->addr, please see
+ * the definition of ublk_sqe_addr_to_auto_buf_reg()
+ *
+ * - pass buffer index from `ublk_auto_buf_reg.index`
+ *
+ * - all reserved fields in `ublk_auto_buf_reg` need to be zeroed
+ *
+ * - pass flags from `ublk_auto_buf_reg.flags` if needed
+ *
+ * This way avoids extra cost from two uring_cmd, but also simplifies backend
+ * implementation, such as, the dependency on IO_REGISTER_IO_BUF and
+ * IO_UNREGISTER_IO_BUF becomes not necessary.
+ *
+ * If wrong data or flags are provided, both IO_FETCH_REQ and
+ * IO_COMMIT_AND_FETCH_REQ are failed, for the latter, the ublk IO request
+ * won't be completed until new IO_COMMIT_AND_FETCH_REQ command is issued
+ * successfully
+ */
+#define UBLK_F_AUTO_BUF_REG (1ULL << 11)
+
+/*
+ * Control command `UBLK_U_CMD_QUIESCE_DEV` is added for quiescing device,
+ * which state can be transitioned to `UBLK_S_DEV_QUIESCED` or
+ * `UBLK_S_DEV_FAIL_IO` finally, and it needs ublk server cooperation for
+ * handling `UBLK_IO_RES_ABORT` correctly.
+ *
+ * Typical use case is for supporting to upgrade ublk server application,
+ * meantime keep ublk block device persistent during the period.
+ *
+ * This feature is only available when UBLK_F_USER_RECOVERY is enabled.
+ *
+ * Note, this command returns -EBUSY in case that all IO commands are being
+ * handled by ublk server and not completed in specified time period which
+ * is passed from the control command parameter.
+ */
+#define UBLK_F_QUIESCE (1ULL << 12)
+
/* device state */
#define UBLK_S_DEV_DEAD 0
#define UBLK_S_DEV_LIVE 1
@@ -297,6 +358,17 @@ struct ublksrv_ctrl_dev_info {
#define UBLK_IO_F_FUA (1U << 13)
#define UBLK_IO_F_NOUNMAP (1U << 15)
#define UBLK_IO_F_SWAP (1U << 16)
+/*
+ * For UBLK_F_AUTO_BUF_REG & UBLK_AUTO_BUF_REG_FALLBACK only.
+ *
+ * This flag is set if auto buffer register is failed & ublk server passes
+ * UBLK_AUTO_BUF_REG_FALLBACK, and ublk server need to register buffer
+ * manually for handling the delivered IO command if this flag is observed
+ *
+ * ublk server has to check this flag if UBLK_AUTO_BUF_REG_FALLBACK is
+ * passed in.
+ */
+#define UBLK_IO_F_NEED_REG_BUF (1U << 17)
/*
* io cmd is described by this structure, and stored in share memory, indexed
@@ -331,6 +403,62 @@ static inline __u32 ublksrv_get_flags(const struct ublksrv_io_desc *iod)
return iod->op_flags >> 8;
}
+/*
+ * If this flag is set, fallback by completing the uring_cmd and setting
+ * `UBLK_IO_F_NEED_REG_BUF` in case of auto-buf-register failure;
+ * otherwise the client ublk request is failed silently
+ *
+ * If ublk server passes this flag, it has to check if UBLK_IO_F_NEED_REG_BUF
+ * is set in `ublksrv_io_desc.op_flags`. If UBLK_IO_F_NEED_REG_BUF is set,
+ * ublk server needs to register io buffer manually for handling IO command.
+ */
+#define UBLK_AUTO_BUF_REG_FALLBACK (1 << 0)
+#define UBLK_AUTO_BUF_REG_F_MASK UBLK_AUTO_BUF_REG_FALLBACK
+
+struct ublk_auto_buf_reg {
+ /* index for registering the delivered request buffer */
+ __u16 index;
+ __u8 flags;
+ __u8 reserved0;
+
+ /*
+ * io_ring FD can be passed via the reserve field in future for
+ * supporting to register io buffer to external io_uring
+ */
+ __u32 reserved1;
+};
+
+/*
+ * For UBLK_F_AUTO_BUF_REG, auto buffer register data is carried via
+ * uring_cmd's sqe->addr:
+ *
+ * - bit0 ~ bit15: buffer index
+ * - bit16 ~ bit23: flags
+ * - bit24 ~ bit31: reserved0
+ * - bit32 ~ bit63: reserved1
+ */
+static inline struct ublk_auto_buf_reg ublk_sqe_addr_to_auto_buf_reg(
+ __u64 sqe_addr)
+{
+ struct ublk_auto_buf_reg reg = {
+ .index = sqe_addr & 0xffff,
+ .flags = (sqe_addr >> 16) & 0xff,
+ .reserved0 = (sqe_addr >> 24) & 0xff,
+ .reserved1 = sqe_addr >> 32,
+ };
+
+ return reg;
+}
+
+static inline __u64
+ublk_auto_buf_reg_to_sqe_addr(const struct ublk_auto_buf_reg *buf)
+{
+ __u64 addr = buf->index | (__u64)buf->flags << 16 | (__u64)buf->reserved0 << 24 |
+ (__u64)buf->reserved1 << 32;
+
+ return addr;
+}
+
/* issued to ublk driver via /dev/ublkcN */
struct ublksrv_io_cmd {
__u16 q_id;
diff --git a/init/Kconfig b/init/Kconfig
index bf3a920064be..127c4a5f66b7 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -477,16 +477,6 @@ config CROSS_MEMORY_ATTACH
to directly read from or write to another process' address space.
See the man page for more details.
-config USELIB
- bool "uselib syscall (for libc5 and earlier)"
- default ALPHA || M68K || SPARC
- help
- This option enables the uselib syscall, a system call used in the
- dynamic linker from libc5 and earlier. glibc does not use this
- system call. If you intend to run programs built on libc5 or
- earlier, you may need to enable this syscall. Current systems
- running glibc can safely disable this.
-
config AUDIT
bool "Auditing support"
depends on NET
diff --git a/io_uring/Makefile b/io_uring/Makefile
index 3e28a741ca15..d97c6b51d584 100644
--- a/io_uring/Makefile
+++ b/io_uring/Makefile
@@ -7,11 +7,11 @@ GCOV_PROFILE := y
endif
obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \
- tctx.o filetable.o rw.o net.o poll.o \
+ tctx.o filetable.o rw.o poll.o \
eventfd.o uring_cmd.o openclose.o \
sqpoll.o xattr.o nop.o fs.o splice.o \
sync.o msg_ring.o advise.o openclose.o \
- statx.o timeout.o fdinfo.o cancel.o \
+ statx.o timeout.o cancel.o \
waitid.o register.o truncate.o \
memmap.o alloc_cache.o
obj-$(CONFIG_IO_URING_ZCRX) += zcrx.o
@@ -19,3 +19,5 @@ obj-$(CONFIG_IO_WQ) += io-wq.o
obj-$(CONFIG_FUTEX) += futex.o
obj-$(CONFIG_EPOLL) += epoll.o
obj-$(CONFIG_NET_RX_BUSY_POLL) += napi.o
+obj-$(CONFIG_NET) += net.o cmd_net.o
+obj-$(CONFIG_PROC_FS) += fdinfo.o
diff --git a/io_uring/advise.c b/io_uring/advise.c
index cb7b881665e5..0073f74e3658 100644
--- a/io_uring/advise.c
+++ b/io_uring/advise.c
@@ -58,7 +58,7 @@ int io_madvise(struct io_kiocb *req, unsigned int issue_flags)
ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice);
io_req_set_res(req, ret, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
#else
return -EOPNOTSUPP;
#endif
@@ -104,5 +104,5 @@ int io_fadvise(struct io_kiocb *req, unsigned int issue_flags)
if (ret < 0)
req_set_fail(req);
io_req_set_res(req, ret, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
}
diff --git a/io_uring/cancel.c b/io_uring/cancel.c
index 0870060bac7c..6d57602304df 100644
--- a/io_uring/cancel.c
+++ b/io_uring/cancel.c
@@ -229,7 +229,7 @@ done:
if (ret < 0)
req_set_fail(req);
io_req_set_res(req, ret, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
}
static int __io_sync_cancel(struct io_uring_task *tctx,
diff --git a/io_uring/cmd_net.c b/io_uring/cmd_net.c
new file mode 100644
index 000000000000..e99170c7d41a
--- /dev/null
+++ b/io_uring/cmd_net.c
@@ -0,0 +1,83 @@
+#include <asm/ioctls.h>
+#include <linux/io_uring/net.h>
+#include <net/sock.h>
+
+#include "uring_cmd.h"
+
+static inline int io_uring_cmd_getsockopt(struct socket *sock,
+ struct io_uring_cmd *cmd,
+ unsigned int issue_flags)
+{
+ const struct io_uring_sqe *sqe = cmd->sqe;
+ bool compat = !!(issue_flags & IO_URING_F_COMPAT);
+ int optlen, optname, level, err;
+ void __user *optval;
+
+ level = READ_ONCE(sqe->level);
+ if (level != SOL_SOCKET)
+ return -EOPNOTSUPP;
+
+ optval = u64_to_user_ptr(READ_ONCE(sqe->optval));
+ optname = READ_ONCE(sqe->optname);
+ optlen = READ_ONCE(sqe->optlen);
+
+ err = do_sock_getsockopt(sock, compat, level, optname,
+ USER_SOCKPTR(optval),
+ KERNEL_SOCKPTR(&optlen));
+ if (err)
+ return err;
+
+ /* On success, return optlen */
+ return optlen;
+}
+
+static inline int io_uring_cmd_setsockopt(struct socket *sock,
+ struct io_uring_cmd *cmd,
+ unsigned int issue_flags)
+{
+ const struct io_uring_sqe *sqe = cmd->sqe;
+ bool compat = !!(issue_flags & IO_URING_F_COMPAT);
+ int optname, optlen, level;
+ void __user *optval;
+ sockptr_t optval_s;
+
+ optval = u64_to_user_ptr(READ_ONCE(sqe->optval));
+ optname = READ_ONCE(sqe->optname);
+ optlen = READ_ONCE(sqe->optlen);
+ level = READ_ONCE(sqe->level);
+ optval_s = USER_SOCKPTR(optval);
+
+ return do_sock_setsockopt(sock, compat, level, optname, optval_s,
+ optlen);
+}
+
+int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags)
+{
+ struct socket *sock = cmd->file->private_data;
+ struct sock *sk = sock->sk;
+ struct proto *prot = READ_ONCE(sk->sk_prot);
+ int ret, arg = 0;
+
+ if (!prot || !prot->ioctl)
+ return -EOPNOTSUPP;
+
+ switch (cmd->cmd_op) {
+ case SOCKET_URING_OP_SIOCINQ:
+ ret = prot->ioctl(sk, SIOCINQ, &arg);
+ if (ret)
+ return ret;
+ return arg;
+ case SOCKET_URING_OP_SIOCOUTQ:
+ ret = prot->ioctl(sk, SIOCOUTQ, &arg);
+ if (ret)
+ return ret;
+ return arg;
+ case SOCKET_URING_OP_GETSOCKOPT:
+ return io_uring_cmd_getsockopt(sock, cmd, issue_flags);
+ case SOCKET_URING_OP_SETSOCKOPT:
+ return io_uring_cmd_setsockopt(sock, cmd, issue_flags);
+ default:
+ return -EOPNOTSUPP;
+ }
+}
+EXPORT_SYMBOL_GPL(io_uring_cmd_sock);
diff --git a/io_uring/epoll.c b/io_uring/epoll.c
index 6d2c48ba1923..8d4610246ba0 100644
--- a/io_uring/epoll.c
+++ b/io_uring/epoll.c
@@ -61,7 +61,7 @@ int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags)
if (ret < 0)
req_set_fail(req);
io_req_set_res(req, ret, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
}
int io_epoll_wait_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -88,5 +88,5 @@ int io_epoll_wait(struct io_kiocb *req, unsigned int issue_flags)
req_set_fail(req);
io_req_set_res(req, ret, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
}
diff --git a/io_uring/eventfd.c b/io_uring/eventfd.c
index 100d5da94cb9..78f8ab7db104 100644
--- a/io_uring/eventfd.c
+++ b/io_uring/eventfd.c
@@ -47,13 +47,6 @@ static void io_eventfd_do_signal(struct rcu_head *rcu)
io_eventfd_put(ev_fd);
}
-static void io_eventfd_release(struct io_ev_fd *ev_fd, bool put_ref)
-{
- if (put_ref)
- io_eventfd_put(ev_fd);
- rcu_read_unlock();
-}
-
/*
* Returns true if the caller should put the ev_fd reference, false if not.
*/
@@ -72,63 +65,34 @@ static bool __io_eventfd_signal(struct io_ev_fd *ev_fd)
/*
* Trigger if eventfd_async isn't set, or if it's set and the caller is
- * an async worker. If ev_fd isn't valid, obviously return false.
+ * an async worker.
*/
static bool io_eventfd_trigger(struct io_ev_fd *ev_fd)
{
- if (ev_fd)
- return !ev_fd->eventfd_async || io_wq_current_is_worker();
- return false;
+ return !ev_fd->eventfd_async || io_wq_current_is_worker();
}
-/*
- * On success, returns with an ev_fd reference grabbed and the RCU read
- * lock held.
- */
-static struct io_ev_fd *io_eventfd_grab(struct io_ring_ctx *ctx)
+void io_eventfd_signal(struct io_ring_ctx *ctx, bool cqe_event)
{
+ bool skip = false;
struct io_ev_fd *ev_fd;
if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
- return NULL;
-
- rcu_read_lock();
+ return;
- /*
- * rcu_dereference ctx->io_ev_fd once and use it for both for checking
- * and eventfd_signal
- */
+ guard(rcu)();
ev_fd = rcu_dereference(ctx->io_ev_fd);
-
/*
* Check again if ev_fd exists in case an io_eventfd_unregister call
* completed between the NULL check of ctx->io_ev_fd at the start of
* the function and rcu_read_lock.
*/
- if (io_eventfd_trigger(ev_fd) && refcount_inc_not_zero(&ev_fd->refs))
- return ev_fd;
-
- rcu_read_unlock();
- return NULL;
-}
-
-void io_eventfd_signal(struct io_ring_ctx *ctx)
-{
- struct io_ev_fd *ev_fd;
-
- ev_fd = io_eventfd_grab(ctx);
- if (ev_fd)
- io_eventfd_release(ev_fd, __io_eventfd_signal(ev_fd));
-}
-
-void io_eventfd_flush_signal(struct io_ring_ctx *ctx)
-{
- struct io_ev_fd *ev_fd;
-
- ev_fd = io_eventfd_grab(ctx);
- if (ev_fd) {
- bool skip, put_ref = true;
+ if (!ev_fd)
+ return;
+ if (!io_eventfd_trigger(ev_fd) || !refcount_inc_not_zero(&ev_fd->refs))
+ return;
+ if (cqe_event) {
/*
* Eventfd should only get triggered when at least one event
* has been posted. Some applications rely on the eventfd
@@ -142,12 +106,10 @@ void io_eventfd_flush_signal(struct io_ring_ctx *ctx)
skip = ctx->cached_cq_tail == ev_fd->last_cq_tail;
ev_fd->last_cq_tail = ctx->cached_cq_tail;
spin_unlock(&ctx->completion_lock);
-
- if (!skip)
- put_ref = __io_eventfd_signal(ev_fd);
-
- io_eventfd_release(ev_fd, put_ref);
}
+
+ if (skip || __io_eventfd_signal(ev_fd))
+ io_eventfd_put(ev_fd);
}
int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
diff --git a/io_uring/eventfd.h b/io_uring/eventfd.h
index d394f49c6321..e2f1985c2cf9 100644
--- a/io_uring/eventfd.h
+++ b/io_uring/eventfd.h
@@ -4,5 +4,4 @@ int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
unsigned int eventfd_async);
int io_eventfd_unregister(struct io_ring_ctx *ctx);
-void io_eventfd_flush_signal(struct io_ring_ctx *ctx);
-void io_eventfd_signal(struct io_ring_ctx *ctx);
+void io_eventfd_signal(struct io_ring_ctx *ctx, bool cqe_event);
diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
index 9414ca6d101c..e9355276ab5d 100644
--- a/io_uring/fdinfo.c
+++ b/io_uring/fdinfo.c
@@ -15,37 +15,6 @@
#include "cancel.h"
#include "rsrc.h"
-#ifdef CONFIG_PROC_FS
-static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id,
- const struct cred *cred)
-{
- struct user_namespace *uns = seq_user_ns(m);
- struct group_info *gi;
- kernel_cap_t cap;
- int g;
-
- seq_printf(m, "%5d\n", id);
- seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid));
- seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid));
- seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid));
- seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid));
- seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid));
- seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid));
- seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid));
- seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid));
- seq_puts(m, "\n\tGroups:\t");
- gi = cred->group_info;
- for (g = 0; g < gi->ngroups; g++) {
- seq_put_decimal_ull(m, g ? " " : "",
- from_kgid_munged(uns, gi->gid[g]));
- }
- seq_puts(m, "\n\tCapEff:\t");
- cap = cred->cap_effective;
- seq_put_hex_ll(m, NULL, cap.val, 16);
- seq_putc(m, '\n');
- return 0;
-}
-
#ifdef CONFIG_NET_RX_BUSY_POLL
static __cold void common_tracking_show_fdinfo(struct io_ring_ctx *ctx,
struct seq_file *m,
@@ -86,13 +55,8 @@ static inline void napi_show_fdinfo(struct io_ring_ctx *ctx,
}
#endif
-/*
- * Caller holds a reference to the file already, we don't need to do
- * anything else to get an extra reference.
- */
-__cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file)
+static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
{
- struct io_ring_ctx *ctx = file->private_data;
struct io_overflow_cqe *ocqe;
struct io_rings *r = ctx->rings;
struct rusage sq_usage;
@@ -106,7 +70,6 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file)
unsigned int sq_entries, cq_entries;
int sq_pid = -1, sq_cpu = -1;
u64 sq_total_time = 0, sq_work_time = 0;
- bool has_lock;
unsigned int i;
if (ctx->flags & IORING_SETUP_CQE32)
@@ -176,15 +139,7 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file)
seq_printf(m, "\n");
}
- /*
- * Avoid ABBA deadlock between the seq lock and the io_uring mutex,
- * since fdinfo case grabs it in the opposite direction of normal use
- * cases. If we fail to get the lock, we just don't iterate any
- * structures that could be going away outside the io_uring mutex.
- */
- has_lock = mutex_trylock(&ctx->uring_lock);
-
- if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) {
+ if (ctx->flags & IORING_SETUP_SQPOLL) {
struct io_sq_data *sq = ctx->sq_data;
/*
@@ -206,7 +161,7 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file)
seq_printf(m, "SqTotalTime:\t%llu\n", sq_total_time);
seq_printf(m, "SqWorkTime:\t%llu\n", sq_work_time);
seq_printf(m, "UserFiles:\t%u\n", ctx->file_table.data.nr);
- for (i = 0; has_lock && i < ctx->file_table.data.nr; i++) {
+ for (i = 0; i < ctx->file_table.data.nr; i++) {
struct file *f = NULL;
if (ctx->file_table.data.nodes[i])
@@ -218,7 +173,7 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file)
}
}
seq_printf(m, "UserBufs:\t%u\n", ctx->buf_table.nr);
- for (i = 0; has_lock && i < ctx->buf_table.nr; i++) {
+ for (i = 0; i < ctx->buf_table.nr; i++) {
struct io_mapped_ubuf *buf = NULL;
if (ctx->buf_table.nodes[i])
@@ -228,17 +183,9 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file)
else
seq_printf(m, "%5u: <none>\n", i);
}
- if (has_lock && !xa_empty(&ctx->personalities)) {
- unsigned long index;
- const struct cred *cred;
-
- seq_printf(m, "Personalities:\n");
- xa_for_each(&ctx->personalities, index, cred)
- io_uring_show_cred(m, index, cred);
- }
seq_puts(m, "PollList:\n");
- for (i = 0; has_lock && i < (1U << ctx->cancel_table.hash_bits); i++) {
+ for (i = 0; i < (1U << ctx->cancel_table.hash_bits); i++) {
struct io_hash_bucket *hb = &ctx->cancel_table.hbs[i];
struct io_kiocb *req;
@@ -247,9 +194,6 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file)
task_work_pending(req->tctx->task));
}
- if (has_lock)
- mutex_unlock(&ctx->uring_lock);
-
seq_puts(m, "CqOverflowList:\n");
spin_lock(&ctx->completion_lock);
list_for_each_entry(ocqe, &ctx->cq_overflow_list, list) {
@@ -262,4 +206,22 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file)
spin_unlock(&ctx->completion_lock);
napi_show_fdinfo(ctx, m);
}
-#endif
+
+/*
+ * Caller holds a reference to the file already, we don't need to do
+ * anything else to get an extra reference.
+ */
+__cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file)
+{
+ struct io_ring_ctx *ctx = file->private_data;
+
+ /*
+ * Avoid ABBA deadlock between the seq lock and the io_uring mutex,
+ * since fdinfo case grabs it in the opposite direction of normal use
+ * cases.
+ */
+ if (mutex_trylock(&ctx->uring_lock)) {
+ __io_uring_show_fdinfo(ctx, m);
+ mutex_unlock(&ctx->uring_lock);
+ }
+}
diff --git a/io_uring/fs.c b/io_uring/fs.c
index eccea851dd5a..37079a414eab 100644
--- a/io_uring/fs.c
+++ b/io_uring/fs.c
@@ -90,7 +90,7 @@ int io_renameat(struct io_kiocb *req, unsigned int issue_flags)
req->flags &= ~REQ_F_NEED_CLEANUP;
io_req_set_res(req, ret, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
}
void io_renameat_cleanup(struct io_kiocb *req)
@@ -141,7 +141,7 @@ int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags)
req->flags &= ~REQ_F_NEED_CLEANUP;
io_req_set_res(req, ret, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
}
void io_unlinkat_cleanup(struct io_kiocb *req)
@@ -185,7 +185,7 @@ int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags)
req->flags &= ~REQ_F_NEED_CLEANUP;
io_req_set_res(req, ret, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
}
void io_mkdirat_cleanup(struct io_kiocb *req)
@@ -235,7 +235,7 @@ int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags)
req->flags &= ~REQ_F_NEED_CLEANUP;
io_req_set_res(req, ret, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
}
int io_linkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -281,7 +281,7 @@ int io_linkat(struct io_kiocb *req, unsigned int issue_flags)
req->flags &= ~REQ_F_NEED_CLEANUP;
io_req_set_res(req, ret, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
}
void io_link_cleanup(struct io_kiocb *req)
diff --git a/io_uring/futex.c b/io_uring/futex.c
index 0ea4820cd8ff..b34695022baa 100644
--- a/io_uring/futex.c
+++ b/io_uring/futex.c
@@ -234,7 +234,7 @@ int io_futexv_wait(struct io_kiocb *req, unsigned int issue_flags)
kfree(futexv);
req->async_data = NULL;
req->flags &= ~REQ_F_ASYNC_DATA;
- return IOU_OK;
+ return IOU_COMPLETE;
}
/*
@@ -311,7 +311,7 @@ done:
req_set_fail(req);
io_req_set_res(req, ret, 0);
kfree(ifd);
- return IOU_OK;
+ return IOU_COMPLETE;
}
int io_futex_wake(struct io_kiocb *req, unsigned int issue_flags)
@@ -328,5 +328,5 @@ int io_futex_wake(struct io_kiocb *req, unsigned int issue_flags)
if (ret < 0)
req_set_fail(req);
io_req_set_res(req, ret, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
}
diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c
index 04a75d666195..cd1fcb115739 100644
--- a/io_uring/io-wq.c
+++ b/io_uring/io-wq.c
@@ -114,9 +114,6 @@ enum {
struct io_wq {
unsigned long state;
- free_work_fn *free_work;
- io_wq_work_fn *do_work;
-
struct io_wq_hash *hash;
atomic_t worker_refs;
@@ -153,6 +150,16 @@ static bool io_acct_cancel_pending_work(struct io_wq *wq,
static void create_worker_cb(struct callback_head *cb);
static void io_wq_cancel_tw_create(struct io_wq *wq);
+static inline unsigned int __io_get_work_hash(unsigned int work_flags)
+{
+ return work_flags >> IO_WQ_HASH_SHIFT;
+}
+
+static inline unsigned int io_get_work_hash(struct io_wq_work *work)
+{
+ return __io_get_work_hash(atomic_read(&work->flags));
+}
+
static bool io_worker_get(struct io_worker *worker)
{
return refcount_inc_not_zero(&worker->ref);
@@ -412,6 +419,30 @@ fail:
return false;
}
+/* Defer if current and next work are both hashed to the same chain */
+static bool io_wq_hash_defer(struct io_wq_work *work, struct io_wq_acct *acct)
+{
+ unsigned int hash, work_flags;
+ struct io_wq_work *next;
+
+ lockdep_assert_held(&acct->lock);
+
+ work_flags = atomic_read(&work->flags);
+ if (!__io_wq_is_hashed(work_flags))
+ return false;
+
+ /* should not happen, io_acct_run_queue() said we had work */
+ if (wq_list_empty(&acct->work_list))
+ return true;
+
+ hash = __io_get_work_hash(work_flags);
+ next = container_of(acct->work_list.first, struct io_wq_work, list);
+ work_flags = atomic_read(&next->flags);
+ if (!__io_wq_is_hashed(work_flags))
+ return false;
+ return hash == __io_get_work_hash(work_flags);
+}
+
static void io_wq_dec_running(struct io_worker *worker)
{
struct io_wq_acct *acct = io_wq_get_acct(worker);
@@ -422,8 +453,14 @@ static void io_wq_dec_running(struct io_worker *worker)
if (!atomic_dec_and_test(&acct->nr_running))
return;
+ if (!worker->cur_work)
+ return;
if (!io_acct_run_queue(acct))
return;
+ if (io_wq_hash_defer(worker->cur_work, acct)) {
+ raw_spin_unlock(&acct->lock);
+ return;
+ }
raw_spin_unlock(&acct->lock);
atomic_inc(&acct->nr_running);
@@ -457,16 +494,6 @@ static void __io_worker_idle(struct io_wq_acct *acct, struct io_worker *worker)
}
}
-static inline unsigned int __io_get_work_hash(unsigned int work_flags)
-{
- return work_flags >> IO_WQ_HASH_SHIFT;
-}
-
-static inline unsigned int io_get_work_hash(struct io_wq_work *work)
-{
- return __io_get_work_hash(atomic_read(&work->flags));
-}
-
static bool io_wait_on_hash(struct io_wq *wq, unsigned int hash)
{
bool ret = false;
@@ -612,10 +639,10 @@ static void io_worker_handle_work(struct io_wq_acct *acct,
if (do_kill &&
(work_flags & IO_WQ_WORK_UNBOUND))
atomic_or(IO_WQ_WORK_CANCEL, &work->flags);
- wq->do_work(work);
+ io_wq_submit_work(work);
io_assign_current_work(worker, NULL);
- linked = wq->free_work(work);
+ linked = io_wq_free_work(work);
work = next_hashed;
if (!work && linked && !io_wq_is_hashed(linked)) {
work = linked;
@@ -934,8 +961,8 @@ static void io_run_cancel(struct io_wq_work *work, struct io_wq *wq)
{
do {
atomic_or(IO_WQ_WORK_CANCEL, &work->flags);
- wq->do_work(work);
- work = wq->free_work(work);
+ io_wq_submit_work(work);
+ work = io_wq_free_work(work);
} while (work);
}
@@ -1195,8 +1222,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
int ret, i;
struct io_wq *wq;
- if (WARN_ON_ONCE(!data->free_work || !data->do_work))
- return ERR_PTR(-EINVAL);
if (WARN_ON_ONCE(!bounded))
return ERR_PTR(-EINVAL);
@@ -1206,8 +1231,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
refcount_inc(&data->hash->refs);
wq->hash = data->hash;
- wq->free_work = data->free_work;
- wq->do_work = data->do_work;
ret = -ENOMEM;
diff --git a/io_uring/io-wq.h b/io_uring/io-wq.h
index d4fb2940e435..774abab54732 100644
--- a/io_uring/io-wq.h
+++ b/io_uring/io-wq.h
@@ -21,9 +21,6 @@ enum io_wq_cancel {
IO_WQ_CANCEL_NOTFOUND, /* work not found */
};
-typedef struct io_wq_work *(free_work_fn)(struct io_wq_work *);
-typedef void (io_wq_work_fn)(struct io_wq_work *);
-
struct io_wq_hash {
refcount_t refs;
unsigned long map;
@@ -39,8 +36,6 @@ static inline void io_wq_put_hash(struct io_wq_hash *hash)
struct io_wq_data {
struct io_wq_hash *hash;
struct task_struct *task;
- io_wq_work_fn *do_work;
- free_work_fn *free_work;
};
struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data);
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 541e65a1eebf..c7a9cecf528e 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -129,7 +129,6 @@
struct io_defer_entry {
struct list_head list;
struct io_kiocb *req;
- u32 seq;
};
/* requests with any of those set should undergo io_disarm_next() */
@@ -149,6 +148,7 @@ static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
bool is_sqpoll_thread);
static void io_queue_sqe(struct io_kiocb *req);
+static void __io_req_caches_free(struct io_ring_ctx *ctx);
static __read_mostly DEFINE_STATIC_KEY_FALSE(io_key_has_sqarray);
@@ -359,6 +359,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
INIT_LIST_HEAD(&ctx->tctx_list);
ctx->submit_state.free_list.next = NULL;
INIT_HLIST_HEAD(&ctx->waitid_list);
+ xa_init_flags(&ctx->zcrx_ctxs, XA_FLAGS_ALLOC);
#ifdef CONFIG_FUTEX
INIT_HLIST_HEAD(&ctx->futex_list);
#endif
@@ -380,25 +381,6 @@ err:
return NULL;
}
-static void io_account_cq_overflow(struct io_ring_ctx *ctx)
-{
- struct io_rings *r = ctx->rings;
-
- WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1);
- ctx->cq_extra--;
-}
-
-static bool req_need_defer(struct io_kiocb *req, u32 seq)
-{
- if (unlikely(req->flags & REQ_F_IO_DRAIN)) {
- struct io_ring_ctx *ctx = req->ctx;
-
- return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail;
- }
-
- return false;
-}
-
static void io_clean_op(struct io_kiocb *req)
{
if (unlikely(req->flags & REQ_F_BUFFER_SELECTED))
@@ -537,20 +519,37 @@ void io_req_queue_iowq(struct io_kiocb *req)
io_req_task_work_add(req);
}
+static unsigned io_linked_nr(struct io_kiocb *req)
+{
+ struct io_kiocb *tmp;
+ unsigned nr = 0;
+
+ io_for_each_link(tmp, req)
+ nr++;
+ return nr;
+}
+
static __cold noinline void io_queue_deferred(struct io_ring_ctx *ctx)
{
- spin_lock(&ctx->completion_lock);
+ bool drain_seen = false, first = true;
+
+ lockdep_assert_held(&ctx->uring_lock);
+ __io_req_caches_free(ctx);
+
while (!list_empty(&ctx->defer_list)) {
struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
struct io_defer_entry, list);
- if (req_need_defer(de->req, de->seq))
- break;
+ drain_seen |= de->req->flags & REQ_F_IO_DRAIN;
+ if ((drain_seen || first) && ctx->nr_req_allocated != ctx->nr_drained)
+ return;
+
list_del_init(&de->list);
+ ctx->nr_drained -= io_linked_nr(de->req);
io_req_task_queue(de->req);
kfree(de);
+ first = false;
}
- spin_unlock(&ctx->completion_lock);
}
void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
@@ -559,10 +558,8 @@ void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
io_poll_wq_wake(ctx);
if (ctx->off_timeout_used)
io_flush_timeouts(ctx);
- if (ctx->drain_active)
- io_queue_deferred(ctx);
if (ctx->has_evfd)
- io_eventfd_flush_signal(ctx);
+ io_eventfd_signal(ctx, true);
}
static inline void __io_cq_lock(struct io_ring_ctx *ctx)
@@ -636,6 +633,7 @@ static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool dying)
* to care for a non-real case.
*/
if (need_resched()) {
+ ctx->cqe_sentinel = ctx->cqe_cached;
io_cq_unlock_post(ctx);
mutex_unlock(&ctx->uring_lock);
cond_resched();
@@ -700,27 +698,20 @@ static __cold void io_uring_drop_tctx_refs(struct task_struct *task)
}
}
-static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
- s32 res, u32 cflags, u64 extra1, u64 extra2)
+static __cold bool io_cqring_add_overflow(struct io_ring_ctx *ctx,
+ struct io_overflow_cqe *ocqe)
{
- struct io_overflow_cqe *ocqe;
- size_t ocq_size = sizeof(struct io_overflow_cqe);
- bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32);
-
lockdep_assert_held(&ctx->completion_lock);
- if (is_cqe32)
- ocq_size += sizeof(struct io_uring_cqe);
-
- ocqe = kmalloc(ocq_size, GFP_ATOMIC | __GFP_ACCOUNT);
- trace_io_uring_cqe_overflow(ctx, user_data, res, cflags, ocqe);
if (!ocqe) {
+ struct io_rings *r = ctx->rings;
+
/*
* If we're in ring overflow flush mode, or in task cancel mode,
* or cannot allocate an overflow entry, then we need to drop it
* on the floor.
*/
- io_account_cq_overflow(ctx);
+ WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1);
set_bit(IO_CHECK_CQ_DROPPED_BIT, &ctx->check_cq);
return false;
}
@@ -729,23 +720,35 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
atomic_or(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags);
}
- ocqe->cqe.user_data = user_data;
- ocqe->cqe.res = res;
- ocqe->cqe.flags = cflags;
- if (is_cqe32) {
- ocqe->cqe.big_cqe[0] = extra1;
- ocqe->cqe.big_cqe[1] = extra2;
- }
list_add_tail(&ocqe->list, &ctx->cq_overflow_list);
return true;
}
-static void io_req_cqe_overflow(struct io_kiocb *req)
+static struct io_overflow_cqe *io_alloc_ocqe(struct io_ring_ctx *ctx,
+ struct io_cqe *cqe,
+ struct io_big_cqe *big_cqe, gfp_t gfp)
{
- io_cqring_event_overflow(req->ctx, req->cqe.user_data,
- req->cqe.res, req->cqe.flags,
- req->big_cqe.extra1, req->big_cqe.extra2);
- memset(&req->big_cqe, 0, sizeof(req->big_cqe));
+ struct io_overflow_cqe *ocqe;
+ size_t ocq_size = sizeof(struct io_overflow_cqe);
+ bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32);
+
+ if (is_cqe32)
+ ocq_size += sizeof(struct io_uring_cqe);
+
+ ocqe = kzalloc(ocq_size, gfp | __GFP_ACCOUNT);
+ trace_io_uring_cqe_overflow(ctx, cqe->user_data, cqe->res, cqe->flags, ocqe);
+ if (ocqe) {
+ ocqe->cqe.user_data = cqe->user_data;
+ ocqe->cqe.res = cqe->res;
+ ocqe->cqe.flags = cqe->flags;
+ if (is_cqe32 && big_cqe) {
+ ocqe->cqe.big_cqe[0] = big_cqe->extra1;
+ ocqe->cqe.big_cqe[1] = big_cqe->extra2;
+ }
+ }
+ if (big_cqe)
+ big_cqe->extra1 = big_cqe->extra2 = 0;
+ return ocqe;
}
/*
@@ -790,13 +793,6 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res,
{
struct io_uring_cqe *cqe;
- ctx->cq_extra++;
-
- /*
- * If we can't get a cq entry, userspace overflowed the
- * submission (by quite a lot). Increment the overflow count in
- * the ring.
- */
if (likely(io_get_cqe(ctx, &cqe))) {
WRITE_ONCE(cqe->user_data, user_data);
WRITE_ONCE(cqe->res, res);
@@ -813,14 +809,43 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res,
return false;
}
+static inline struct io_cqe io_init_cqe(u64 user_data, s32 res, u32 cflags)
+{
+ return (struct io_cqe) { .user_data = user_data, .res = res, .flags = cflags };
+}
+
+static __cold void io_cqe_overflow(struct io_ring_ctx *ctx, struct io_cqe *cqe,
+ struct io_big_cqe *big_cqe)
+{
+ struct io_overflow_cqe *ocqe;
+
+ ocqe = io_alloc_ocqe(ctx, cqe, big_cqe, GFP_KERNEL);
+ spin_lock(&ctx->completion_lock);
+ io_cqring_add_overflow(ctx, ocqe);
+ spin_unlock(&ctx->completion_lock);
+}
+
+static __cold bool io_cqe_overflow_locked(struct io_ring_ctx *ctx,
+ struct io_cqe *cqe,
+ struct io_big_cqe *big_cqe)
+{
+ struct io_overflow_cqe *ocqe;
+
+ ocqe = io_alloc_ocqe(ctx, cqe, big_cqe, GFP_ATOMIC);
+ return io_cqring_add_overflow(ctx, ocqe);
+}
+
bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags)
{
bool filled;
io_cq_lock(ctx);
filled = io_fill_cqe_aux(ctx, user_data, res, cflags);
- if (!filled)
- filled = io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0);
+ if (unlikely(!filled)) {
+ struct io_cqe cqe = io_init_cqe(user_data, res, cflags);
+
+ filled = io_cqe_overflow_locked(ctx, &cqe, NULL);
+ }
io_cq_unlock_post(ctx);
return filled;
}
@@ -831,10 +856,13 @@ bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags
*/
void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags)
{
+ lockdep_assert_held(&ctx->uring_lock);
+ lockdep_assert(ctx->lockless_cq);
+
if (!io_fill_cqe_aux(ctx, user_data, res, cflags)) {
- spin_lock(&ctx->completion_lock);
- io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0);
- spin_unlock(&ctx->completion_lock);
+ struct io_cqe cqe = io_init_cqe(user_data, res, cflags);
+
+ io_cqe_overflow(ctx, &cqe, NULL);
}
ctx->submit_state.cq_flush = true;
}
@@ -924,22 +952,6 @@ void io_req_defer_failed(struct io_kiocb *req, s32 res)
}
/*
- * Don't initialise the fields below on every allocation, but do that in
- * advance and keep them valid across allocations.
- */
-static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx)
-{
- req->ctx = ctx;
- req->buf_node = NULL;
- req->file_node = NULL;
- req->link = NULL;
- req->async_data = NULL;
- /* not necessary, but safer to zero */
- memset(&req->cqe, 0, sizeof(req->cqe));
- memset(&req->big_cqe, 0, sizeof(req->big_cqe));
-}
-
-/*
* A request might get retired back into the request caches even before opcode
* handlers and io_issue_sqe() are done with it, e.g. inline completion path.
* Because of that, io_alloc_req() should be called only under ->uring_lock
@@ -948,7 +960,7 @@ static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx)
__cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
__must_hold(&ctx->uring_lock)
{
- gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
+ gfp_t gfp = GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO;
void *reqs[IO_REQ_ALLOC_BATCH];
int ret;
@@ -966,10 +978,11 @@ __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
}
percpu_ref_get_many(&ctx->refs, ret);
+ ctx->nr_req_allocated += ret;
+
while (ret--) {
struct io_kiocb *req = reqs[ret];
- io_preinit_req(req, ctx);
io_req_add_to_cache(req, ctx);
}
return true;
@@ -1191,7 +1204,7 @@ static void io_req_local_work_add(struct io_kiocb *req, unsigned flags)
if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
if (ctx->has_evfd)
- io_eventfd_signal(ctx);
+ io_eventfd_signal(ctx, false);
}
nr_wait = atomic_read(&ctx->cq_wait_nr);
@@ -1383,6 +1396,16 @@ void io_queue_next(struct io_kiocb *req)
io_req_task_queue(nxt);
}
+static inline void io_req_put_rsrc_nodes(struct io_kiocb *req)
+{
+ if (req->file_node) {
+ io_put_rsrc_node(req->ctx, req->file_node);
+ req->file_node = NULL;
+ }
+ if (req->flags & REQ_F_BUF_NODE)
+ io_put_rsrc_node(req->ctx, req->buf_node);
+}
+
static void io_free_batch_list(struct io_ring_ctx *ctx,
struct io_wq_work_node *node)
__must_hold(&ctx->uring_lock)
@@ -1443,13 +1466,10 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx)
*/
if (!(req->flags & (REQ_F_CQE_SKIP | REQ_F_REISSUE)) &&
unlikely(!io_fill_cqe_req(ctx, req))) {
- if (ctx->lockless_cq) {
- spin_lock(&ctx->completion_lock);
- io_req_cqe_overflow(req);
- spin_unlock(&ctx->completion_lock);
- } else {
- io_req_cqe_overflow(req);
- }
+ if (ctx->lockless_cq)
+ io_cqe_overflow(ctx, &req->cqe, &req->big_cqe);
+ else
+ io_cqe_overflow_locked(ctx, &req->cqe, &req->big_cqe);
}
}
__io_cq_unlock_post(ctx);
@@ -1458,6 +1478,10 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx)
io_free_batch_list(ctx, state->compl_reqs.first);
INIT_WQ_LIST(&state->compl_reqs);
}
+
+ if (unlikely(ctx->drain_active))
+ io_queue_deferred(ctx);
+
ctx->submit_state.cq_flush = false;
}
@@ -1645,56 +1669,28 @@ io_req_flags_t io_file_get_flags(struct file *file)
return res;
}
-static u32 io_get_sequence(struct io_kiocb *req)
-{
- u32 seq = req->ctx->cached_sq_head;
- struct io_kiocb *cur;
-
- /* need original cached_sq_head, but it was increased for each req */
- io_for_each_link(cur, req)
- seq--;
- return seq;
-}
-
static __cold void io_drain_req(struct io_kiocb *req)
__must_hold(&ctx->uring_lock)
{
struct io_ring_ctx *ctx = req->ctx;
+ bool drain = req->flags & IOSQE_IO_DRAIN;
struct io_defer_entry *de;
- int ret;
- u32 seq = io_get_sequence(req);
-
- /* Still need defer if there is pending req in defer list. */
- spin_lock(&ctx->completion_lock);
- if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) {
- spin_unlock(&ctx->completion_lock);
-queue:
- ctx->drain_active = false;
- io_req_task_queue(req);
- return;
- }
- spin_unlock(&ctx->completion_lock);
- io_prep_async_link(req);
- de = kmalloc(sizeof(*de), GFP_KERNEL);
+ de = kmalloc(sizeof(*de), GFP_KERNEL_ACCOUNT);
if (!de) {
- ret = -ENOMEM;
- io_req_defer_failed(req, ret);
+ io_req_defer_failed(req, -ENOMEM);
return;
}
- spin_lock(&ctx->completion_lock);
- if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) {
- spin_unlock(&ctx->completion_lock);
- kfree(de);
- goto queue;
- }
-
+ io_prep_async_link(req);
trace_io_uring_defer(req);
de->req = req;
- de->seq = seq;
+
+ ctx->nr_drained += io_linked_nr(req);
list_add_tail(&de->list, &ctx->defer_list);
- spin_unlock(&ctx->completion_lock);
+ io_queue_deferred(ctx);
+ if (!drain && list_empty(&ctx->defer_list))
+ ctx->drain_active = false;
}
static bool io_assign_file(struct io_kiocb *req, const struct io_issue_def *def,
@@ -1756,7 +1752,7 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
ret = __io_issue_sqe(req, issue_flags, def);
- if (ret == IOU_OK) {
+ if (ret == IOU_COMPLETE) {
if (issue_flags & IO_URING_F_COMPLETE_DEFER)
io_req_complete_defer(req);
else
@@ -1815,7 +1811,7 @@ void io_wq_submit_work(struct io_wq_work *work)
bool needs_poll = false;
int ret = 0, err = -ECANCELED;
- /* one will be dropped by ->io_wq_free_work() after returning to io-wq */
+ /* one will be dropped by io_wq_free_work() after returning to io-wq */
if (!(req->flags & REQ_F_REFCOUNT))
__io_req_set_refcount(req, 2);
else
@@ -1913,7 +1909,8 @@ inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
io_ring_submit_lock(ctx, issue_flags);
node = io_rsrc_node_lookup(&ctx->file_table.data, fd);
if (node) {
- io_req_assign_rsrc_node(&req->file_node, node);
+ node->refs++;
+ req->file_node = node;
req->flags |= io_slot_flags(node);
file = io_slot_file(node);
}
@@ -2046,7 +2043,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
int personality;
u8 opcode;
- /* req is partially pre-initialised, see io_preinit_req() */
+ req->ctx = ctx;
req->opcode = opcode = READ_ONCE(sqe->opcode);
/* same numerical values with corresponding REQ_F_*, safe to copy */
sqe_flags = READ_ONCE(sqe->flags);
@@ -2277,10 +2274,6 @@ static bool io_get_sqe(struct io_ring_ctx *ctx, const struct io_uring_sqe **sqe)
(!(ctx->flags & IORING_SETUP_NO_SQARRAY))) {
head = READ_ONCE(ctx->sq_array[head]);
if (unlikely(head >= ctx->sq_entries)) {
- /* drop invalid entries */
- spin_lock(&ctx->completion_lock);
- ctx->cq_extra--;
- spin_unlock(&ctx->completion_lock);
WRITE_ONCE(ctx->rings->sq_dropped,
READ_ONCE(ctx->rings->sq_dropped) + 1);
return false;
@@ -2698,21 +2691,26 @@ unsigned long rings_size(unsigned int flags, unsigned int sq_entries,
return off;
}
-static void io_req_caches_free(struct io_ring_ctx *ctx)
+static __cold void __io_req_caches_free(struct io_ring_ctx *ctx)
{
struct io_kiocb *req;
int nr = 0;
- mutex_lock(&ctx->uring_lock);
-
while (!io_req_cache_empty(ctx)) {
req = io_extract_req(ctx);
kmem_cache_free(req_cachep, req);
nr++;
}
- if (nr)
+ if (nr) {
+ ctx->nr_req_allocated -= nr;
percpu_ref_put_many(&ctx->refs, nr);
- mutex_unlock(&ctx->uring_lock);
+ }
+}
+
+static __cold void io_req_caches_free(struct io_ring_ctx *ctx)
+{
+ guard(mutex)(&ctx->uring_lock);
+ __io_req_caches_free(ctx);
}
static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
@@ -2748,6 +2746,9 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
percpu_ref_exit(&ctx->refs);
free_uid(ctx->user);
io_req_caches_free(ctx);
+
+ WARN_ON_ONCE(ctx->nr_req_allocated);
+
if (ctx->hash_map)
io_wq_put_hash(ctx->hash_map);
io_napi_free(ctx);
@@ -2882,7 +2883,7 @@ static __cold void io_ring_exit_work(struct work_struct *work)
io_cqring_overflow_kill(ctx);
mutex_unlock(&ctx->uring_lock);
}
- if (ctx->ifq) {
+ if (!xa_empty(&ctx->zcrx_ctxs)) {
mutex_lock(&ctx->uring_lock);
io_shutdown_zcrx_ifqs(ctx);
mutex_unlock(&ctx->uring_lock);
@@ -3014,20 +3015,19 @@ static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx,
struct io_defer_entry *de;
LIST_HEAD(list);
- spin_lock(&ctx->completion_lock);
list_for_each_entry_reverse(de, &ctx->defer_list, list) {
if (io_match_task_safe(de->req, tctx, cancel_all)) {
list_cut_position(&list, &ctx->defer_list, &de->list);
break;
}
}
- spin_unlock(&ctx->completion_lock);
if (list_empty(&list))
return false;
while (!list_empty(&list)) {
de = list_first_entry(&list, struct io_defer_entry, list);
list_del_init(&de->list);
+ ctx->nr_drained -= io_linked_nr(de->req);
io_req_task_queue_fail(de->req, -ECANCELED);
kfree(de);
}
@@ -3102,8 +3102,8 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) &&
io_allowed_defer_tw_run(ctx))
ret |= io_run_local_work(ctx, INT_MAX, INT_MAX) > 0;
- ret |= io_cancel_defer_files(ctx, tctx, cancel_all);
mutex_lock(&ctx->uring_lock);
+ ret |= io_cancel_defer_files(ctx, tctx, cancel_all);
ret |= io_poll_remove_all(ctx, tctx, cancel_all);
ret |= io_waitid_remove_all(ctx, tctx, cancel_all);
ret |= io_futex_remove_all(ctx, tctx, cancel_all);
@@ -3913,6 +3913,8 @@ static int __init io_uring_init(void)
BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in);
BUILD_BUG_SQE_ELEM(44, __u32, file_index);
BUILD_BUG_SQE_ELEM(44, __u16, addr_len);
+ BUILD_BUG_SQE_ELEM(44, __u8, write_stream);
+ BUILD_BUG_SQE_ELEM(45, __u8, __pad4[0]);
BUILD_BUG_SQE_ELEM(46, __u16, __pad3[0]);
BUILD_BUG_SQE_ELEM(48, __u64, addr3);
BUILD_BUG_SQE_ELEM_SIZE(48, 0, cmd);
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index e4050b2d0821..0ea7a435d1de 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -19,7 +19,6 @@
#endif
enum {
- IOU_OK = 0, /* deprecated, use IOU_COMPLETE */
IOU_COMPLETE = 0,
IOU_ISSUE_SKIP_COMPLETE = -EIOCBQUEUED,
@@ -196,7 +195,6 @@ static inline bool io_defer_get_uncommited_cqe(struct io_ring_ctx *ctx,
{
io_lockdep_assert_cq_locked(ctx);
- ctx->cq_extra++;
ctx->submit_state.cq_flush = true;
return io_get_cqe(ctx, cqe_ret);
}
@@ -414,7 +412,7 @@ static inline void io_req_complete_defer(struct io_kiocb *req)
static inline void io_commit_cqring_flush(struct io_ring_ctx *ctx)
{
- if (unlikely(ctx->off_timeout_used || ctx->drain_active ||
+ if (unlikely(ctx->off_timeout_used ||
ctx->has_evfd || ctx->poll_activated))
__io_commit_cqring_flush(ctx);
}
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index 953d5e742569..8cce3ebd813f 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -92,7 +92,6 @@ void io_kbuf_drop_legacy(struct io_kiocb *req)
{
if (WARN_ON_ONCE(!(req->flags & REQ_F_BUFFER_SELECTED)))
return;
- req->buf_index = req->kbuf->bgid;
req->flags &= ~REQ_F_BUFFER_SELECTED;
kfree(req->kbuf);
req->kbuf = NULL;
@@ -110,7 +109,6 @@ bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags)
bl = io_buffer_get_list(ctx, buf->bgid);
list_add(&buf->list, &bl->buf_list);
req->flags &= ~REQ_F_BUFFER_SELECTED;
- req->buf_index = buf->bgid;
io_ring_submit_unlock(ctx, issue_flags);
return true;
@@ -193,7 +191,7 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
}
void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
- unsigned int issue_flags)
+ unsigned buf_group, unsigned int issue_flags)
{
struct io_ring_ctx *ctx = req->ctx;
struct io_buffer_list *bl;
@@ -201,7 +199,7 @@ void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
io_ring_submit_lock(req->ctx, issue_flags);
- bl = io_buffer_get_list(ctx, req->buf_index);
+ bl = io_buffer_get_list(ctx, buf_group);
if (likely(bl)) {
if (bl->flags & IOBL_BUF_RING)
ret = io_ring_buffer_select(req, len, bl, issue_flags);
@@ -302,7 +300,7 @@ int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg,
int ret = -ENOENT;
io_ring_submit_lock(ctx, issue_flags);
- bl = io_buffer_get_list(ctx, req->buf_index);
+ bl = io_buffer_get_list(ctx, arg->buf_group);
if (unlikely(!bl))
goto out_unlock;
@@ -335,7 +333,7 @@ int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg)
lockdep_assert_held(&ctx->uring_lock);
- bl = io_buffer_get_list(ctx, req->buf_index);
+ bl = io_buffer_get_list(ctx, arg->buf_group);
if (unlikely(!bl))
return -ENOENT;
@@ -355,10 +353,9 @@ static inline bool __io_put_kbuf_ring(struct io_kiocb *req, int len, int nr)
struct io_buffer_list *bl = req->buf_list;
bool ret = true;
- if (bl) {
+ if (bl)
ret = io_kbuf_commit(req, bl, len, nr);
- req->buf_index = bl->bgid;
- }
+
req->flags &= ~REQ_F_BUFFER_RING;
return ret;
}
@@ -379,45 +376,33 @@ unsigned int __io_put_kbufs(struct io_kiocb *req, int len, int nbufs)
return ret;
}
-static int __io_remove_buffers(struct io_ring_ctx *ctx,
- struct io_buffer_list *bl, unsigned nbufs)
+static int io_remove_buffers_legacy(struct io_ring_ctx *ctx,
+ struct io_buffer_list *bl,
+ unsigned long nbufs)
{
- unsigned i = 0;
-
- /* shouldn't happen */
- if (!nbufs)
- return 0;
-
- if (bl->flags & IOBL_BUF_RING) {
- i = bl->buf_ring->tail - bl->head;
- io_free_region(ctx, &bl->region);
- /* make sure it's seen as empty */
- INIT_LIST_HEAD(&bl->buf_list);
- bl->flags &= ~IOBL_BUF_RING;
- return i;
- }
+ unsigned long i = 0;
+ struct io_buffer *nxt;
/* protects io_buffers_cache */
lockdep_assert_held(&ctx->uring_lock);
+ WARN_ON_ONCE(bl->flags & IOBL_BUF_RING);
- while (!list_empty(&bl->buf_list)) {
- struct io_buffer *nxt;
-
+ for (i = 0; i < nbufs && !list_empty(&bl->buf_list); i++) {
nxt = list_first_entry(&bl->buf_list, struct io_buffer, list);
list_del(&nxt->list);
kfree(nxt);
-
- if (++i == nbufs)
- return i;
cond_resched();
}
-
return i;
}
static void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl)
{
- __io_remove_buffers(ctx, bl, -1U);
+ if (bl->flags & IOBL_BUF_RING)
+ io_free_region(ctx, &bl->region);
+ else
+ io_remove_buffers_legacy(ctx, bl, -1U);
+
kfree(bl);
}
@@ -465,30 +450,6 @@ int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return 0;
}
-int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
-{
- struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
- struct io_ring_ctx *ctx = req->ctx;
- struct io_buffer_list *bl;
- int ret = 0;
-
- io_ring_submit_lock(ctx, issue_flags);
-
- ret = -ENOENT;
- bl = io_buffer_get_list(ctx, p->bgid);
- if (bl) {
- ret = -EINVAL;
- /* can't use provide/remove buffers command on mapped buffers */
- if (!(bl->flags & IOBL_BUF_RING))
- ret = __io_remove_buffers(ctx, bl, p->nbufs);
- }
- io_ring_submit_unlock(ctx, issue_flags);
- if (ret < 0)
- req_set_fail(req);
- io_req_set_res(req, ret, 0);
- return IOU_OK;
-}
-
int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
unsigned long size, tmp_check;
@@ -512,8 +473,6 @@ int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
return -EOVERFLOW;
if (check_add_overflow((unsigned long)p->addr, size, &tmp_check))
return -EOVERFLOW;
-
- size = (unsigned long)p->len * p->nbufs;
if (!access_ok(u64_to_user_ptr(p->addr), size))
return -EFAULT;
@@ -552,49 +511,56 @@ static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf,
return i ? 0 : -ENOMEM;
}
-int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
+static int __io_manage_buffers_legacy(struct io_kiocb *req,
+ struct io_buffer_list *bl)
{
struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
- struct io_ring_ctx *ctx = req->ctx;
- struct io_buffer_list *bl;
- int ret = 0;
-
- io_ring_submit_lock(ctx, issue_flags);
+ int ret;
- bl = io_buffer_get_list(ctx, p->bgid);
- if (unlikely(!bl)) {
+ if (!bl) {
+ if (req->opcode != IORING_OP_PROVIDE_BUFFERS)
+ return -ENOENT;
bl = kzalloc(sizeof(*bl), GFP_KERNEL_ACCOUNT);
- if (!bl) {
- ret = -ENOMEM;
- goto err;
- }
+ if (!bl)
+ return -ENOMEM;
+
INIT_LIST_HEAD(&bl->buf_list);
- ret = io_buffer_add_list(ctx, bl, p->bgid);
+ ret = io_buffer_add_list(req->ctx, bl, p->bgid);
if (ret) {
kfree(bl);
- goto err;
+ return ret;
}
}
- /* can't add buffers via this command for a mapped buffer ring */
- if (bl->flags & IOBL_BUF_RING) {
- ret = -EINVAL;
- goto err;
- }
+ /* can't use provide/remove buffers command on mapped buffers */
+ if (bl->flags & IOBL_BUF_RING)
+ return -EINVAL;
+ if (req->opcode == IORING_OP_PROVIDE_BUFFERS)
+ return io_add_buffers(req->ctx, p, bl);
+ return io_remove_buffers_legacy(req->ctx, bl, p->nbufs);
+}
+
+int io_manage_buffers_legacy(struct io_kiocb *req, unsigned int issue_flags)
+{
+ struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_buffer_list *bl;
+ int ret;
- ret = io_add_buffers(ctx, p, bl);
-err:
+ io_ring_submit_lock(ctx, issue_flags);
+ bl = io_buffer_get_list(ctx, p->bgid);
+ ret = __io_manage_buffers_legacy(req, bl);
io_ring_submit_unlock(ctx, issue_flags);
if (ret < 0)
req_set_fail(req);
io_req_set_res(req, ret, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
}
int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
{
struct io_uring_buf_reg reg;
- struct io_buffer_list *bl, *free_bl = NULL;
+ struct io_buffer_list *bl;
struct io_uring_region_desc rd;
struct io_uring_buf_ring *br;
unsigned long mmap_offset;
@@ -605,8 +571,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
if (copy_from_user(&reg, arg, sizeof(reg)))
return -EFAULT;
-
- if (reg.resv[0] || reg.resv[1] || reg.resv[2])
+ if (!mem_is_zero(reg.resv, sizeof(reg.resv)))
return -EINVAL;
if (reg.flags & ~(IOU_PBUF_RING_MMAP | IOU_PBUF_RING_INC))
return -EINVAL;
@@ -624,7 +589,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
io_destroy_bl(ctx, bl);
}
- free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL);
+ bl = kzalloc(sizeof(*bl), GFP_KERNEL_ACCOUNT);
if (!bl)
return -ENOMEM;
@@ -669,7 +634,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
return 0;
fail:
io_free_region(ctx, &bl->region);
- kfree(free_bl);
+ kfree(bl);
return ret;
}
@@ -682,9 +647,7 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
if (copy_from_user(&reg, arg, sizeof(reg)))
return -EFAULT;
- if (reg.resv[0] || reg.resv[1] || reg.resv[2])
- return -EINVAL;
- if (reg.flags)
+ if (!mem_is_zero(reg.resv, sizeof(reg.resv)) || reg.flags)
return -EINVAL;
bl = io_buffer_get_list(ctx, reg.bgid);
@@ -704,14 +667,11 @@ int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg)
{
struct io_uring_buf_status buf_status;
struct io_buffer_list *bl;
- int i;
if (copy_from_user(&buf_status, arg, sizeof(buf_status)))
return -EFAULT;
-
- for (i = 0; i < ARRAY_SIZE(buf_status.resv); i++)
- if (buf_status.resv[i])
- return -EINVAL;
+ if (!mem_is_zero(buf_status.resv, sizeof(buf_status.resv)))
+ return -EINVAL;
bl = io_buffer_get_list(ctx, buf_status.buf_group);
if (!bl)
diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h
index 2ec0b983ce24..4d2c209d1a41 100644
--- a/io_uring/kbuf.h
+++ b/io_uring/kbuf.h
@@ -55,20 +55,19 @@ struct buf_sel_arg {
size_t max_len;
unsigned short nr_iovs;
unsigned short mode;
+ unsigned buf_group;
};
void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
- unsigned int issue_flags);
+ unsigned buf_group, unsigned int issue_flags);
int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg,
unsigned int issue_flags);
int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg);
void io_destroy_buffers(struct io_ring_ctx *ctx);
int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
-int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags);
-
int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
-int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags);
+int io_manage_buffers_legacy(struct io_kiocb *req, unsigned int issue_flags);
int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg);
int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg);
@@ -94,7 +93,6 @@ static inline bool io_kbuf_recycle_ring(struct io_kiocb *req)
* to monopolize the buffer.
*/
if (req->buf_list) {
- req->buf_index = req->buf_list->bgid;
req->flags &= ~(REQ_F_BUFFER_RING|REQ_F_BUFFERS_COMMIT);
return true;
}
diff --git a/io_uring/memmap.c b/io_uring/memmap.c
index 76fcc79656b0..725dc0bec24c 100644
--- a/io_uring/memmap.c
+++ b/io_uring/memmap.c
@@ -13,6 +13,7 @@
#include "memmap.h"
#include "kbuf.h"
#include "rsrc.h"
+#include "zcrx.h"
static void *io_mem_alloc_compound(struct page **pages, int nr_pages,
size_t size, gfp_t gfp)
@@ -116,7 +117,7 @@ static int io_region_init_ptr(struct io_mapped_region *mr)
void *ptr;
if (io_check_coalesce_buffer(mr->pages, mr->nr_pages, &ifd)) {
- if (ifd.nr_folios == 1) {
+ if (ifd.nr_folios == 1 && !PageHighMem(mr->pages[0])) {
mr->ptr = page_address(mr->pages[0]);
return 0;
}
@@ -258,7 +259,8 @@ static struct io_mapped_region *io_mmap_get_region(struct io_ring_ctx *ctx,
loff_t pgoff)
{
loff_t offset = pgoff << PAGE_SHIFT;
- unsigned int bgid;
+ unsigned int id;
+
switch (offset & IORING_OFF_MMAP_MASK) {
case IORING_OFF_SQ_RING:
@@ -267,12 +269,13 @@ static struct io_mapped_region *io_mmap_get_region(struct io_ring_ctx *ctx,
case IORING_OFF_SQES:
return &ctx->sq_region;
case IORING_OFF_PBUF_RING:
- bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
- return io_pbuf_get_region(ctx, bgid);
+ id = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
+ return io_pbuf_get_region(ctx, id);
case IORING_MAP_OFF_PARAM_REGION:
return &ctx->param_region;
case IORING_MAP_OFF_ZCRX_REGION:
- return &ctx->zcrx_region;
+ id = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_ZCRX_SHIFT;
+ return io_zcrx_get_region(ctx, id);
}
return NULL;
}
diff --git a/io_uring/memmap.h b/io_uring/memmap.h
index dad0aa5b1b45..08419684e4bc 100644
--- a/io_uring/memmap.h
+++ b/io_uring/memmap.h
@@ -4,7 +4,9 @@
#define IORING_MAP_OFF_PARAM_REGION 0x20000000ULL
#define IORING_MAP_OFF_ZCRX_REGION 0x30000000ULL
-struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages);
+#define IORING_OFF_ZCRX_SHIFT 16
+
+struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages);
#ifndef CONFIG_MMU
unsigned int io_uring_nommu_mmap_capabilities(struct file *file);
diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c
index 50a958e9c921..71400d6cefc8 100644
--- a/io_uring/msg_ring.c
+++ b/io_uring/msg_ring.c
@@ -328,7 +328,7 @@ done:
req_set_fail(req);
}
io_req_set_res(req, ret, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
}
int io_uring_sync_msg_ring(struct io_uring_sqe *sqe)
diff --git a/io_uring/net.c b/io_uring/net.c
index 24040bc3916a..d13f3e8f6c72 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -18,7 +18,6 @@
#include "rsrc.h"
#include "zcrx.h"
-#if defined(CONFIG_NET)
struct io_shutdown {
struct file *file;
int how;
@@ -129,7 +128,7 @@ int io_shutdown(struct io_kiocb *req, unsigned int issue_flags)
ret = __sys_shutdown_sock(sock, shutdown->how);
io_req_set_res(req, ret, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
}
static bool io_net_retry(struct socket *sock, int flags)
@@ -190,7 +189,6 @@ static inline void io_mshot_prep_retry(struct io_kiocb *req,
sr->done_io = 0;
sr->retry = false;
sr->len = 0; /* get from the provided buffer */
- req->buf_index = sr->buf_group;
}
static int io_net_import_vec(struct io_kiocb *req, struct io_async_msghdr *iomsg,
@@ -359,15 +357,13 @@ static int io_send_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe)
kmsg->msg.msg_name = &kmsg->addr;
kmsg->msg.msg_namelen = addr_len;
}
- if (sr->flags & IORING_RECVSEND_FIXED_BUF)
+ if (sr->flags & IORING_RECVSEND_FIXED_BUF) {
+ req->flags |= REQ_F_IMPORT_BUFFER;
return 0;
- if (!io_do_buffer_select(req)) {
- ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len,
- &kmsg->msg.msg_iter);
- if (unlikely(ret < 0))
- return ret;
}
- return 0;
+ if (req->flags & REQ_F_BUFFER_SELECT)
+ return 0;
+ return import_ubuf(ITER_SOURCE, sr->buf, sr->len, &kmsg->msg.msg_iter);
}
static int io_sendmsg_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -409,13 +405,12 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
if (sr->msg_flags & MSG_DONTWAIT)
req->flags |= REQ_F_NOWAIT;
+ if (req->flags & REQ_F_BUFFER_SELECT)
+ sr->buf_group = req->buf_index;
if (sr->flags & IORING_RECVSEND_BUNDLE) {
if (req->opcode == IORING_OP_SENDMSG)
return -EINVAL;
- if (!(req->flags & REQ_F_BUFFER_SELECT))
- return -EINVAL;
sr->msg_flags |= MSG_WAITALL;
- sr->buf_group = req->buf_index;
req->buf_list = NULL;
req->flags |= REQ_F_MULTISHOT;
}
@@ -507,7 +502,7 @@ static inline bool io_send_finish(struct io_kiocb *req, int *ret,
/* Otherwise stop bundle and use the current result. */
finish:
io_req_set_res(req, *ret, cflags);
- *ret = IOU_OK;
+ *ret = IOU_COMPLETE;
return true;
}
@@ -558,7 +553,7 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
else if (sr->done_io)
ret = sr->done_io;
io_req_set_res(req, ret, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
}
static int io_send_select_buffer(struct io_kiocb *req, unsigned int issue_flags,
@@ -571,6 +566,7 @@ static int io_send_select_buffer(struct io_kiocb *req, unsigned int issue_flags,
.iovs = &kmsg->fast_iov,
.max_len = min_not_zero(sr->len, INT_MAX),
.nr_iovs = 1,
+ .buf_group = sr->buf_group,
};
if (kmsg->vec.iovec) {
@@ -723,7 +719,6 @@ static int io_recvmsg_prep_setup(struct io_kiocb *req)
{
struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
struct io_async_msghdr *kmsg;
- int ret;
kmsg = io_msg_alloc_async(req);
if (unlikely(!kmsg))
@@ -739,13 +734,10 @@ static int io_recvmsg_prep_setup(struct io_kiocb *req)
kmsg->msg.msg_iocb = NULL;
kmsg->msg.msg_ubuf = NULL;
- if (!io_do_buffer_select(req)) {
- ret = import_ubuf(ITER_DEST, sr->buf, sr->len,
- &kmsg->msg.msg_iter);
- if (unlikely(ret))
- return ret;
- }
- return 0;
+ if (req->flags & REQ_F_BUFFER_SELECT)
+ return 0;
+ return import_ubuf(ITER_DEST, sr->buf, sr->len,
+ &kmsg->msg.msg_iter);
}
return io_recvmsg_copy_hdr(req, kmsg);
@@ -827,18 +819,24 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
cflags |= IORING_CQE_F_SOCK_NONEMPTY;
if (sr->flags & IORING_RECVSEND_BUNDLE) {
- cflags |= io_put_kbufs(req, *ret, io_bundle_nbufs(kmsg, *ret),
+ size_t this_ret = *ret - sr->done_io;
+
+ cflags |= io_put_kbufs(req, *ret, io_bundle_nbufs(kmsg, this_ret),
issue_flags);
if (sr->retry)
cflags = req->cqe.flags | (cflags & CQE_F_MASK);
/* bundle with no more immediate buffers, we're done */
if (req->flags & REQ_F_BL_EMPTY)
goto finish;
- /* if more is available, retry and append to this one */
- if (!sr->retry && kmsg->msg.msg_inq > 0 && *ret > 0) {
+ /*
+ * If more is available AND it was a full transfer, retry and
+ * append to this one
+ */
+ if (!sr->retry && kmsg->msg.msg_inq > 0 && this_ret > 0 &&
+ !iov_iter_count(&kmsg->msg.msg_iter)) {
req->cqe.flags = cflags & ~CQE_F_MASK;
sr->len = kmsg->msg.msg_inq;
- sr->done_io += *ret;
+ sr->done_io += this_ret;
sr->retry = true;
return false;
}
@@ -985,7 +983,7 @@ retry_multishot:
void __user *buf;
size_t len = sr->len;
- buf = io_buffer_select(req, &len, issue_flags);
+ buf = io_buffer_select(req, &len, sr->buf_group, issue_flags);
if (!buf)
return -ENOBUFS;
@@ -1063,6 +1061,7 @@ static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg
.iovs = &kmsg->fast_iov,
.nr_iovs = 1,
.mode = KBUF_MODE_EXPAND,
+ .buf_group = sr->buf_group,
};
if (kmsg->vec.iovec) {
@@ -1095,7 +1094,7 @@ static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg
void __user *buf;
*len = sr->len;
- buf = io_buffer_select(req, len, issue_flags);
+ buf = io_buffer_select(req, len, sr->buf_group, issue_flags);
if (!buf)
return -ENOBUFS;
sr->buf = buf;
@@ -1191,16 +1190,14 @@ int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc);
unsigned ifq_idx;
- if (unlikely(sqe->file_index || sqe->addr2 || sqe->addr ||
- sqe->addr3))
+ if (unlikely(sqe->addr2 || sqe->addr || sqe->addr3))
return -EINVAL;
ifq_idx = READ_ONCE(sqe->zcrx_ifq_idx);
- if (ifq_idx != 0)
- return -EINVAL;
- zc->ifq = req->ctx->ifq;
+ zc->ifq = xa_load(&req->ctx->zcrx_ctxs, ifq_idx);
if (!zc->ifq)
return -EINVAL;
+
zc->len = READ_ONCE(sqe->len);
zc->flags = READ_ONCE(sqe->ioprio);
zc->msg_flags = READ_ONCE(sqe->msg_flags);
@@ -1321,8 +1318,6 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return -ENOMEM;
if (req->opcode == IORING_OP_SEND_ZC) {
- if (zc->flags & IORING_RECVSEND_FIXED_BUF)
- req->flags |= REQ_F_IMPORT_BUFFER;
ret = io_send_setup(req, sqe);
} else {
if (unlikely(sqe->addr2 || sqe->file_index))
@@ -1470,7 +1465,7 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags)
io_req_msg_cleanup(req, 0);
}
io_req_set_res(req, ret, IORING_CQE_F_MORE);
- return IOU_OK;
+ return IOU_COMPLETE;
}
int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags)
@@ -1541,7 +1536,7 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags)
io_req_msg_cleanup(req, 0);
}
io_req_set_res(req, ret, IORING_CQE_F_MORE);
- return IOU_OK;
+ return IOU_COMPLETE;
}
void io_sendrecv_fail(struct io_kiocb *req)
@@ -1705,7 +1700,7 @@ int io_socket(struct io_kiocb *req, unsigned int issue_flags)
sock->file_slot);
}
io_req_set_res(req, ret, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
}
int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -1772,7 +1767,7 @@ out:
req_set_fail(req);
io_req_msg_cleanup(req, issue_flags);
io_req_set_res(req, ret, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
}
int io_bind_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -1846,4 +1841,3 @@ void io_netmsg_cache_free(const void *entry)
io_vec_free(&kmsg->vec);
kfree(kmsg);
}
-#endif
diff --git a/io_uring/nop.c b/io_uring/nop.c
index 28f06285fdc2..6ac2de761fd3 100644
--- a/io_uring/nop.c
+++ b/io_uring/nop.c
@@ -68,5 +68,5 @@ done:
if (ret < 0)
req_set_fail(req);
io_req_set_res(req, nop->result, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
}
diff --git a/io_uring/notif.c b/io_uring/notif.c
index 7bd92538dccb..9a6f6e92d742 100644
--- a/io_uring/notif.c
+++ b/io_uring/notif.c
@@ -112,6 +112,7 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx)
if (unlikely(!io_alloc_req(ctx, &notif)))
return NULL;
+ notif->ctx = ctx;
notif->opcode = IORING_OP_NOP;
notif->flags = 0;
notif->file = NULL;
diff --git a/io_uring/opdef.c b/io_uring/opdef.c
index 489384c0438b..6e0882b051f9 100644
--- a/io_uring/opdef.c
+++ b/io_uring/opdef.c
@@ -333,13 +333,13 @@ const struct io_issue_def io_issue_defs[] = {
.audit_skip = 1,
.iopoll = 1,
.prep = io_provide_buffers_prep,
- .issue = io_provide_buffers,
+ .issue = io_manage_buffers_legacy,
},
[IORING_OP_REMOVE_BUFFERS] = {
.audit_skip = 1,
.iopoll = 1,
.prep = io_remove_buffers_prep,
- .issue = io_remove_buffers,
+ .issue = io_manage_buffers_legacy,
},
[IORING_OP_TEE] = {
.needs_file = 1,
@@ -569,6 +569,10 @@ const struct io_issue_def io_issue_defs[] = {
.prep = io_prep_writev_fixed,
.issue = io_write,
},
+ [IORING_OP_PIPE] = {
+ .prep = io_pipe_prep,
+ .issue = io_pipe,
+ },
};
const struct io_cold_def io_cold_defs[] = {
@@ -815,6 +819,9 @@ const struct io_cold_def io_cold_defs[] = {
.cleanup = io_readv_writev_cleanup,
.fail = io_rw_fail,
},
+ [IORING_OP_PIPE] = {
+ .name = "PIPE",
+ },
};
const char *io_uring_get_opcode(u8 opcode)
diff --git a/io_uring/openclose.c b/io_uring/openclose.c
index e3357dfa14ca..83e36ad4e31b 100644
--- a/io_uring/openclose.c
+++ b/io_uring/openclose.c
@@ -6,6 +6,8 @@
#include <linux/fdtable.h>
#include <linux/fsnotify.h>
#include <linux/namei.h>
+#include <linux/pipe_fs_i.h>
+#include <linux/watch_queue.h>
#include <linux/io_uring.h>
#include <uapi/linux/io_uring.h>
@@ -169,7 +171,7 @@ err:
if (ret < 0)
req_set_fail(req);
io_req_set_res(req, ret, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
}
int io_openat(struct io_kiocb *req, unsigned int issue_flags)
@@ -257,7 +259,7 @@ err:
if (ret < 0)
req_set_fail(req);
io_req_set_res(req, ret, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
}
int io_install_fixed_fd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -300,5 +302,136 @@ int io_install_fixed_fd(struct io_kiocb *req, unsigned int issue_flags)
if (ret < 0)
req_set_fail(req);
io_req_set_res(req, ret, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
+}
+
+struct io_pipe {
+ struct file *file;
+ int __user *fds;
+ int flags;
+ int file_slot;
+ unsigned long nofile;
+};
+
+int io_pipe_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+ struct io_pipe *p = io_kiocb_to_cmd(req, struct io_pipe);
+
+ if (sqe->fd || sqe->off || sqe->addr3)
+ return -EINVAL;
+
+ p->fds = u64_to_user_ptr(READ_ONCE(sqe->addr));
+ p->flags = READ_ONCE(sqe->pipe_flags);
+ if (p->flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT | O_NOTIFICATION_PIPE))
+ return -EINVAL;
+
+ p->file_slot = READ_ONCE(sqe->file_index);
+ p->nofile = rlimit(RLIMIT_NOFILE);
+ return 0;
+}
+
+static int io_pipe_fixed(struct io_kiocb *req, struct file **files,
+ unsigned int issue_flags)
+{
+ struct io_pipe *p = io_kiocb_to_cmd(req, struct io_pipe);
+ struct io_ring_ctx *ctx = req->ctx;
+ int ret, fds[2] = { -1, -1 };
+ int slot = p->file_slot;
+
+ if (p->flags & O_CLOEXEC)
+ return -EINVAL;
+
+ io_ring_submit_lock(ctx, issue_flags);
+
+ ret = __io_fixed_fd_install(ctx, files[0], slot);
+ if (ret < 0)
+ goto err;
+ fds[0] = ret;
+ files[0] = NULL;
+
+ /*
+ * If a specific slot is given, next one will be used for
+ * the write side.
+ */
+ if (slot != IORING_FILE_INDEX_ALLOC)
+ slot++;
+
+ ret = __io_fixed_fd_install(ctx, files[1], slot);
+ if (ret < 0)
+ goto err;
+ fds[1] = ret;
+ files[1] = NULL;
+
+ io_ring_submit_unlock(ctx, issue_flags);
+
+ if (!copy_to_user(p->fds, fds, sizeof(fds)))
+ return 0;
+
+ ret = -EFAULT;
+ io_ring_submit_lock(ctx, issue_flags);
+err:
+ if (fds[0] != -1)
+ io_fixed_fd_remove(ctx, fds[0]);
+ if (fds[1] != -1)
+ io_fixed_fd_remove(ctx, fds[1]);
+ io_ring_submit_unlock(ctx, issue_flags);
+ return ret;
+}
+
+static int io_pipe_fd(struct io_kiocb *req, struct file **files)
+{
+ struct io_pipe *p = io_kiocb_to_cmd(req, struct io_pipe);
+ int ret, fds[2] = { -1, -1 };
+
+ ret = __get_unused_fd_flags(p->flags, p->nofile);
+ if (ret < 0)
+ goto err;
+ fds[0] = ret;
+
+ ret = __get_unused_fd_flags(p->flags, p->nofile);
+ if (ret < 0)
+ goto err;
+ fds[1] = ret;
+
+ if (!copy_to_user(p->fds, fds, sizeof(fds))) {
+ fd_install(fds[0], files[0]);
+ fd_install(fds[1], files[1]);
+ return 0;
+ }
+ ret = -EFAULT;
+err:
+ if (fds[0] != -1)
+ put_unused_fd(fds[0]);
+ if (fds[1] != -1)
+ put_unused_fd(fds[1]);
+ return ret;
+}
+
+int io_pipe(struct io_kiocb *req, unsigned int issue_flags)
+{
+ struct io_pipe *p = io_kiocb_to_cmd(req, struct io_pipe);
+ struct file *files[2];
+ int ret;
+
+ ret = create_pipe_files(files, p->flags);
+ if (ret)
+ return ret;
+ files[0]->f_mode |= FMODE_NOWAIT;
+ files[1]->f_mode |= FMODE_NOWAIT;
+
+ if (!!p->file_slot)
+ ret = io_pipe_fixed(req, files, issue_flags);
+ else
+ ret = io_pipe_fd(req, files);
+
+ io_req_set_res(req, ret, 0);
+ if (!ret)
+ return IOU_COMPLETE;
+
+ req_set_fail(req);
+ if (files[0])
+ fput(files[0]);
+ if (files[1])
+ fput(files[1]);
+ return ret;
}
diff --git a/io_uring/openclose.h b/io_uring/openclose.h
index 8a93c98ad0ad..4ca2a9935abc 100644
--- a/io_uring/openclose.h
+++ b/io_uring/openclose.h
@@ -13,5 +13,8 @@ int io_openat2(struct io_kiocb *req, unsigned int issue_flags);
int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_close(struct io_kiocb *req, unsigned int issue_flags);
+int io_pipe_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
+int io_pipe(struct io_kiocb *req, unsigned int issue_flags);
+
int io_install_fixed_fd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_install_fixed_fd(struct io_kiocb *req, unsigned int issue_flags);
diff --git a/io_uring/poll.c b/io_uring/poll.c
index 8eb744eb9f4c..0526062e2f81 100644
--- a/io_uring/poll.c
+++ b/io_uring/poll.c
@@ -893,7 +893,7 @@ int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
ret = __io_arm_poll_handler(req, poll, &ipt, poll->events, issue_flags);
if (ret > 0) {
io_req_set_res(req, ipt.result_mask, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
}
return ret ?: IOU_ISSUE_SKIP_COMPLETE;
}
@@ -948,5 +948,5 @@ out:
}
/* complete update request, we're done with it */
io_req_set_res(req, ret, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
}
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index f80a77c4973f..c592ceace97d 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -80,10 +80,21 @@ static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
return 0;
}
-int io_buffer_validate(struct iovec *iov)
+int io_validate_user_buf_range(u64 uaddr, u64 ulen)
{
- unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);
+ unsigned long tmp, base = (unsigned long)uaddr;
+ unsigned long acct_len = (unsigned long)PAGE_ALIGN(ulen);
+ /* arbitrary limit, but we need something */
+ if (ulen > SZ_1G || !ulen)
+ return -EFAULT;
+ if (check_add_overflow(base, acct_len, &tmp))
+ return -EOVERFLOW;
+ return 0;
+}
+
+static int io_buffer_validate(struct iovec *iov)
+{
/*
* Don't impose further limits on the size and buffer
* constraints here, we'll -EINVAL later when IO is
@@ -91,17 +102,9 @@ int io_buffer_validate(struct iovec *iov)
*/
if (!iov->iov_base)
return iov->iov_len ? -EFAULT : 0;
- if (!iov->iov_len)
- return -EFAULT;
-
- /* arbitrary limit, but we need something */
- if (iov->iov_len > SZ_1G)
- return -EFAULT;
- if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp))
- return -EOVERFLOW;
-
- return 0;
+ return io_validate_user_buf_range((unsigned long)iov->iov_base,
+ iov->iov_len);
}
static void io_release_ubuf(void *priv)
@@ -497,7 +500,7 @@ int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
if (ret < 0)
req_set_fail(req);
io_req_set_res(req, ret, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
}
void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
@@ -685,38 +688,34 @@ static bool io_coalesce_buffer(struct page ***pages, int *nr_pages,
struct io_imu_folio_data *data)
{
struct page **page_array = *pages, **new_array = NULL;
- int nr_pages_left = *nr_pages, i, j;
- int nr_folios = data->nr_folios;
+ unsigned nr_pages_left = *nr_pages;
+ unsigned nr_folios = data->nr_folios;
+ unsigned i, j;
/* Store head pages only*/
- new_array = kvmalloc_array(nr_folios, sizeof(struct page *),
- GFP_KERNEL);
+ new_array = kvmalloc_array(nr_folios, sizeof(struct page *), GFP_KERNEL);
if (!new_array)
return false;
- new_array[0] = compound_head(page_array[0]);
- /*
- * The pages are bound to the folio, it doesn't
- * actually unpin them but drops all but one reference,
- * which is usually put down by io_buffer_unmap().
- * Note, needs a better helper.
- */
- if (data->nr_pages_head > 1)
- unpin_user_pages(&page_array[1], data->nr_pages_head - 1);
-
- j = data->nr_pages_head;
- nr_pages_left -= data->nr_pages_head;
- for (i = 1; i < nr_folios; i++) {
- unsigned int nr_unpin;
-
- new_array[i] = page_array[j];
- nr_unpin = min_t(unsigned int, nr_pages_left - 1,
- data->nr_pages_mid - 1);
- if (nr_unpin)
- unpin_user_pages(&page_array[j+1], nr_unpin);
- j += data->nr_pages_mid;
- nr_pages_left -= data->nr_pages_mid;
+ for (i = 0, j = 0; i < nr_folios; i++) {
+ struct page *p = compound_head(page_array[j]);
+ struct folio *folio = page_folio(p);
+ unsigned int nr;
+
+ WARN_ON_ONCE(i > 0 && p != page_array[j]);
+
+ nr = i ? data->nr_pages_mid : data->nr_pages_head;
+ nr = min(nr, nr_pages_left);
+ /* Drop all but one ref, the entire folio will remain pinned. */
+ if (nr > 1)
+ unpin_user_folio(folio, nr - 1);
+ j += nr;
+ nr_pages_left -= nr;
+ new_array[i] = p;
}
+
+ WARN_ON_ONCE(j != *nr_pages);
+
kvfree(page_array);
*pages = new_array;
*nr_pages = nr_folios;
@@ -1062,8 +1061,6 @@ static int io_import_fixed(int ddir, struct iov_iter *iter,
size_t offset;
int ret;
- if (WARN_ON_ONCE(!imu))
- return -EFAULT;
ret = validate_fixed_range(buf_addr, len, imu);
if (unlikely(ret))
return ret;
@@ -1110,13 +1107,19 @@ inline struct io_rsrc_node *io_find_buf_node(struct io_kiocb *req,
if (req->flags & REQ_F_BUF_NODE)
return req->buf_node;
+ req->flags |= REQ_F_BUF_NODE;
io_ring_submit_lock(ctx, issue_flags);
node = io_rsrc_node_lookup(&ctx->buf_table, req->buf_index);
- if (node)
- io_req_assign_buf_node(req, node);
+ if (node) {
+ node->refs++;
+ req->buf_node = node;
+ io_ring_submit_unlock(ctx, issue_flags);
+ return node;
+ }
+ req->flags &= ~REQ_F_BUF_NODE;
io_ring_submit_unlock(ctx, issue_flags);
- return node;
+ return NULL;
}
int io_import_reg_buf(struct io_kiocb *req, struct iov_iter *iter,
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index b52242852ff3..0d2138f16322 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -83,7 +83,7 @@ int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
unsigned size, unsigned type);
int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
unsigned int size, unsigned int type);
-int io_buffer_validate(struct iovec *iov);
+int io_validate_user_buf_range(u64 uaddr, u64 ulen);
bool io_check_coalesce_buffer(struct page **page_array, int nr_pages,
struct io_imu_folio_data *data);
@@ -115,32 +115,6 @@ static inline bool io_reset_rsrc_node(struct io_ring_ctx *ctx,
return true;
}
-static inline void io_req_put_rsrc_nodes(struct io_kiocb *req)
-{
- if (req->file_node) {
- io_put_rsrc_node(req->ctx, req->file_node);
- req->file_node = NULL;
- }
- if (req->flags & REQ_F_BUF_NODE) {
- io_put_rsrc_node(req->ctx, req->buf_node);
- req->buf_node = NULL;
- }
-}
-
-static inline void io_req_assign_rsrc_node(struct io_rsrc_node **dst_node,
- struct io_rsrc_node *node)
-{
- node->refs++;
- *dst_node = node;
-}
-
-static inline void io_req_assign_buf_node(struct io_kiocb *req,
- struct io_rsrc_node *node)
-{
- io_req_assign_rsrc_node(&req->buf_node, node);
- req->flags |= REQ_F_BUF_NODE;
-}
-
int io_files_update(struct io_kiocb *req, unsigned int issue_flags);
int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 039e063f7091..710d8cd53ebb 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -119,7 +119,7 @@ static int __io_import_rw_buffer(int ddir, struct io_kiocb *req,
return io_import_vec(ddir, req, io, buf, sqe_len);
if (io_do_buffer_select(req)) {
- buf = io_buffer_select(req, &sqe_len, issue_flags);
+ buf = io_buffer_select(req, &sqe_len, io->buf_group, issue_flags);
if (!buf)
return -ENOBUFS;
rw->addr = (unsigned long) buf;
@@ -253,16 +253,19 @@ static int __io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
int ddir)
{
struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
+ struct io_async_rw *io;
unsigned ioprio;
u64 attr_type_mask;
int ret;
if (io_rw_alloc_async(req))
return -ENOMEM;
+ io = req->async_data;
rw->kiocb.ki_pos = READ_ONCE(sqe->off);
/* used for fixed read/write too - just read unconditionally */
req->buf_index = READ_ONCE(sqe->buf_index);
+ io->buf_group = req->buf_index;
ioprio = READ_ONCE(sqe->ioprio);
if (ioprio) {
@@ -276,6 +279,7 @@ static int __io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
}
rw->kiocb.dio_complete = NULL;
rw->kiocb.ki_flags = 0;
+ rw->kiocb.ki_write_stream = READ_ONCE(sqe->write_stream);
if (req->ctx->flags & IORING_SETUP_IOPOLL)
rw->kiocb.ki_complete = io_complete_rw_iopoll;
@@ -657,7 +661,7 @@ static int kiocb_done(struct io_kiocb *req, ssize_t ret,
io_req_io_end(req);
io_req_set_res(req, final_ret, io_put_kbuf(req, ret, issue_flags));
io_req_rw_cleanup(req, issue_flags);
- return IOU_OK;
+ return IOU_COMPLETE;
} else {
io_rw_done(req, ret);
}
diff --git a/io_uring/rw.h b/io_uring/rw.h
index 81d6d9a8cf69..129a53fe5482 100644
--- a/io_uring/rw.h
+++ b/io_uring/rw.h
@@ -16,6 +16,8 @@ struct io_async_rw {
struct iov_iter iter;
struct iov_iter_state iter_state;
struct iovec fast_iov;
+ unsigned buf_group;
+
/*
* wpq is for buffered io, while meta fields are used with
* direct io
diff --git a/io_uring/splice.c b/io_uring/splice.c
index 7b89bd84d486..35ce4e60b495 100644
--- a/io_uring/splice.c
+++ b/io_uring/splice.c
@@ -103,7 +103,7 @@ done:
if (ret != sp->len)
req_set_fail(req);
io_req_set_res(req, ret, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
}
int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -144,5 +144,5 @@ done:
if (ret != sp->len)
req_set_fail(req);
io_req_set_res(req, ret, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
}
diff --git a/io_uring/statx.c b/io_uring/statx.c
index 6bc4651700a2..5111e9befbfe 100644
--- a/io_uring/statx.c
+++ b/io_uring/statx.c
@@ -59,7 +59,7 @@ int io_statx(struct io_kiocb *req, unsigned int issue_flags)
ret = do_statx(sx->dfd, sx->filename, sx->flags, sx->mask, sx->buffer);
io_req_set_res(req, ret, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
}
void io_statx_cleanup(struct io_kiocb *req)
diff --git a/io_uring/sync.c b/io_uring/sync.c
index 255f68c37e55..cea2d381ffd2 100644
--- a/io_uring/sync.c
+++ b/io_uring/sync.c
@@ -47,7 +47,7 @@ int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags)
ret = sync_file_range(req->file, sync->off, sync->len, sync->flags);
io_req_set_res(req, ret, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
}
int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -79,7 +79,7 @@ int io_fsync(struct io_kiocb *req, unsigned int issue_flags)
ret = vfs_fsync_range(req->file, sync->off, end > 0 ? end : LLONG_MAX,
sync->flags & IORING_FSYNC_DATASYNC);
io_req_set_res(req, ret, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
}
int io_fallocate_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -108,5 +108,5 @@ int io_fallocate(struct io_kiocb *req, unsigned int issue_flags)
if (ret >= 0)
fsnotify_modify(req->file);
io_req_set_res(req, ret, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
}
diff --git a/io_uring/tctx.c b/io_uring/tctx.c
index adc6e42c14df..5b66755579c0 100644
--- a/io_uring/tctx.c
+++ b/io_uring/tctx.c
@@ -35,8 +35,6 @@ static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx,
data.hash = hash;
data.task = task;
- data.free_work = io_wq_free_work;
- data.do_work = io_wq_submit_work;
/* Do QD, or 4 * CPUS, whatever is smallest */
concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
diff --git a/io_uring/timeout.c b/io_uring/timeout.c
index 2a107665230b..7f13bfa9f2b6 100644
--- a/io_uring/timeout.c
+++ b/io_uring/timeout.c
@@ -35,6 +35,9 @@ struct io_timeout_rem {
bool ltimeout;
};
+static struct io_kiocb *__io_disarm_linked_timeout(struct io_kiocb *req,
+ struct io_kiocb *link);
+
static inline bool io_is_timeout_noseq(struct io_kiocb *req)
{
struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout);
@@ -218,7 +221,9 @@ void io_disarm_next(struct io_kiocb *req)
struct io_ring_ctx *ctx = req->ctx;
raw_spin_lock_irq(&ctx->timeout_lock);
- link = io_disarm_linked_timeout(req);
+ if (req->link && req->link->opcode == IORING_OP_LINK_TIMEOUT)
+ link = __io_disarm_linked_timeout(req, req->link);
+
raw_spin_unlock_irq(&ctx->timeout_lock);
if (link)
io_req_queue_tw_complete(link, -ECANCELED);
@@ -228,8 +233,8 @@ void io_disarm_next(struct io_kiocb *req)
io_fail_links(req);
}
-struct io_kiocb *__io_disarm_linked_timeout(struct io_kiocb *req,
- struct io_kiocb *link)
+static struct io_kiocb *__io_disarm_linked_timeout(struct io_kiocb *req,
+ struct io_kiocb *link)
__must_hold(&req->ctx->completion_lock)
__must_hold(&req->ctx->timeout_lock)
{
@@ -500,7 +505,7 @@ int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags)
if (ret < 0)
req_set_fail(req);
io_req_set_res(req, ret, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
}
static int __io_timeout_prep(struct io_kiocb *req,
diff --git a/io_uring/timeout.h b/io_uring/timeout.h
index e91b32448dcf..2b7c9ad72992 100644
--- a/io_uring/timeout.h
+++ b/io_uring/timeout.h
@@ -8,19 +8,6 @@ struct io_timeout_data {
u32 flags;
};
-struct io_kiocb *__io_disarm_linked_timeout(struct io_kiocb *req,
- struct io_kiocb *link);
-
-static inline struct io_kiocb *io_disarm_linked_timeout(struct io_kiocb *req)
-{
- struct io_kiocb *link = req->link;
-
- if (link && link->opcode == IORING_OP_LINK_TIMEOUT)
- return __io_disarm_linked_timeout(req, link);
-
- return NULL;
-}
-
__cold void io_flush_timeouts(struct io_ring_ctx *ctx);
struct io_cancel_data;
int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd);
diff --git a/io_uring/truncate.c b/io_uring/truncate.c
index 62ee73d34d72..487baf23b44e 100644
--- a/io_uring/truncate.c
+++ b/io_uring/truncate.c
@@ -44,5 +44,5 @@ int io_ftruncate(struct io_kiocb *req, unsigned int issue_flags)
ret = do_ftruncate(req->file, ft->len, 1);
io_req_set_res(req, ret, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
}
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index a9ea7d29cdd9..929cad6ee326 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -3,13 +3,10 @@
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/io_uring/cmd.h>
-#include <linux/io_uring/net.h>
#include <linux/security.h>
#include <linux/nospec.h>
-#include <net/sock.h>
#include <uapi/linux/io_uring.h>
-#include <asm/ioctls.h>
#include "io_uring.h"
#include "alloc_cache.h"
@@ -254,6 +251,11 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags)
return -EOPNOTSUPP;
issue_flags |= IO_URING_F_IOPOLL;
req->iopoll_completed = 0;
+ if (ctx->flags & IORING_SETUP_HYBRID_IOPOLL) {
+ /* make sure every req only blocks once */
+ req->flags &= ~REQ_F_IOPOLL_STATE;
+ req->iopoll_start = ktime_get_ns();
+ }
}
ret = file->f_op->uring_cmd(ioucmd, issue_flags);
@@ -263,7 +265,7 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags)
req_set_fail(req);
io_req_uring_cleanup(req, issue_flags);
io_req_set_res(req, ret, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
}
int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
@@ -273,6 +275,9 @@ int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
{
struct io_kiocb *req = cmd_to_io_kiocb(ioucmd);
+ if (WARN_ON_ONCE(!(ioucmd->flags & IORING_URING_CMD_FIXED)))
+ return -EINVAL;
+
return io_import_reg_buf(req, iter, ubuf, len, rw, issue_flags);
}
EXPORT_SYMBOL_GPL(io_uring_cmd_import_fixed);
@@ -287,6 +292,9 @@ int io_uring_cmd_import_fixed_vec(struct io_uring_cmd *ioucmd,
struct io_async_cmd *ac = req->async_data;
int ret;
+ if (WARN_ON_ONCE(!(ioucmd->flags & IORING_URING_CMD_FIXED)))
+ return -EINVAL;
+
ret = io_prep_reg_iovec(req, &ac->vec, uvec, uvec_segs);
if (ret)
return ret;
@@ -302,83 +310,3 @@ void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd)
io_req_queue_iowq(req);
}
-
-static inline int io_uring_cmd_getsockopt(struct socket *sock,
- struct io_uring_cmd *cmd,
- unsigned int issue_flags)
-{
- const struct io_uring_sqe *sqe = cmd->sqe;
- bool compat = !!(issue_flags & IO_URING_F_COMPAT);
- int optlen, optname, level, err;
- void __user *optval;
-
- level = READ_ONCE(sqe->level);
- if (level != SOL_SOCKET)
- return -EOPNOTSUPP;
-
- optval = u64_to_user_ptr(READ_ONCE(sqe->optval));
- optname = READ_ONCE(sqe->optname);
- optlen = READ_ONCE(sqe->optlen);
-
- err = do_sock_getsockopt(sock, compat, level, optname,
- USER_SOCKPTR(optval),
- KERNEL_SOCKPTR(&optlen));
- if (err)
- return err;
-
- /* On success, return optlen */
- return optlen;
-}
-
-static inline int io_uring_cmd_setsockopt(struct socket *sock,
- struct io_uring_cmd *cmd,
- unsigned int issue_flags)
-{
- const struct io_uring_sqe *sqe = cmd->sqe;
- bool compat = !!(issue_flags & IO_URING_F_COMPAT);
- int optname, optlen, level;
- void __user *optval;
- sockptr_t optval_s;
-
- optval = u64_to_user_ptr(READ_ONCE(sqe->optval));
- optname = READ_ONCE(sqe->optname);
- optlen = READ_ONCE(sqe->optlen);
- level = READ_ONCE(sqe->level);
- optval_s = USER_SOCKPTR(optval);
-
- return do_sock_setsockopt(sock, compat, level, optname, optval_s,
- optlen);
-}
-
-#if defined(CONFIG_NET)
-int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags)
-{
- struct socket *sock = cmd->file->private_data;
- struct sock *sk = sock->sk;
- struct proto *prot = READ_ONCE(sk->sk_prot);
- int ret, arg = 0;
-
- if (!prot || !prot->ioctl)
- return -EOPNOTSUPP;
-
- switch (cmd->cmd_op) {
- case SOCKET_URING_OP_SIOCINQ:
- ret = prot->ioctl(sk, SIOCINQ, &arg);
- if (ret)
- return ret;
- return arg;
- case SOCKET_URING_OP_SIOCOUTQ:
- ret = prot->ioctl(sk, SIOCOUTQ, &arg);
- if (ret)
- return ret;
- return arg;
- case SOCKET_URING_OP_GETSOCKOPT:
- return io_uring_cmd_getsockopt(sock, cmd, issue_flags);
- case SOCKET_URING_OP_SETSOCKOPT:
- return io_uring_cmd_setsockopt(sock, cmd, issue_flags);
- default:
- return -EOPNOTSUPP;
- }
-}
-EXPORT_SYMBOL_GPL(io_uring_cmd_sock);
-#endif
diff --git a/io_uring/uring_cmd.h b/io_uring/uring_cmd.h
index b04686b6b5d2..e6a5142c890e 100644
--- a/io_uring/uring_cmd.h
+++ b/io_uring/uring_cmd.h
@@ -17,9 +17,3 @@ bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx,
struct io_uring_task *tctx, bool cancel_all);
void io_cmd_cache_free(const void *entry);
-
-int io_uring_cmd_import_fixed_vec(struct io_uring_cmd *ioucmd,
- const struct iovec __user *uvec,
- size_t uvec_segs,
- int ddir, struct iov_iter *iter,
- unsigned issue_flags);
diff --git a/io_uring/waitid.c b/io_uring/waitid.c
index 54e69984cd8a..e07a94694397 100644
--- a/io_uring/waitid.c
+++ b/io_uring/waitid.c
@@ -323,5 +323,5 @@ done:
if (ret < 0)
req_set_fail(req);
io_req_set_res(req, ret, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
}
diff --git a/io_uring/xattr.c b/io_uring/xattr.c
index de5064fcae8a..322b94ff9e4b 100644
--- a/io_uring/xattr.c
+++ b/io_uring/xattr.c
@@ -109,7 +109,7 @@ int io_fgetxattr(struct io_kiocb *req, unsigned int issue_flags)
ret = file_getxattr(req->file, &ix->ctx);
io_xattr_finish(req, ret);
- return IOU_OK;
+ return IOU_COMPLETE;
}
int io_getxattr(struct io_kiocb *req, unsigned int issue_flags)
@@ -122,7 +122,7 @@ int io_getxattr(struct io_kiocb *req, unsigned int issue_flags)
ret = filename_getxattr(AT_FDCWD, ix->filename, LOOKUP_FOLLOW, &ix->ctx);
ix->filename = NULL;
io_xattr_finish(req, ret);
- return IOU_OK;
+ return IOU_COMPLETE;
}
static int __io_setxattr_prep(struct io_kiocb *req,
@@ -190,7 +190,7 @@ int io_fsetxattr(struct io_kiocb *req, unsigned int issue_flags)
ret = file_setxattr(req->file, &ix->ctx);
io_xattr_finish(req, ret);
- return IOU_OK;
+ return IOU_COMPLETE;
}
int io_setxattr(struct io_kiocb *req, unsigned int issue_flags)
@@ -203,5 +203,5 @@ int io_setxattr(struct io_kiocb *req, unsigned int issue_flags)
ret = filename_setxattr(AT_FDCWD, ix->filename, LOOKUP_FOLLOW, &ix->ctx);
ix->filename = NULL;
io_xattr_finish(req, ret);
- return IOU_OK;
+ return IOU_COMPLETE;
}
diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index fe86606b9f30..9a568d049204 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -26,29 +26,207 @@
#include "zcrx.h"
#include "rsrc.h"
+#define IO_DMA_ATTR (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING)
+
static inline struct io_zcrx_ifq *io_pp_to_ifq(struct page_pool *pp)
{
return pp->mp_priv;
}
-#define IO_DMA_ATTR (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING)
+static inline struct io_zcrx_area *io_zcrx_iov_to_area(const struct net_iov *niov)
+{
+ struct net_iov_area *owner = net_iov_owner(niov);
-static void __io_zcrx_unmap_area(struct io_zcrx_ifq *ifq,
- struct io_zcrx_area *area, int nr_mapped)
+ return container_of(owner, struct io_zcrx_area, nia);
+}
+
+static inline struct page *io_zcrx_iov_page(const struct net_iov *niov)
+{
+ struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
+
+ return area->mem.pages[net_iov_idx(niov)];
+}
+
+static void io_release_dmabuf(struct io_zcrx_mem *mem)
+{
+ if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER))
+ return;
+
+ if (mem->sgt)
+ dma_buf_unmap_attachment_unlocked(mem->attach, mem->sgt,
+ DMA_FROM_DEVICE);
+ if (mem->attach)
+ dma_buf_detach(mem->dmabuf, mem->attach);
+ if (mem->dmabuf)
+ dma_buf_put(mem->dmabuf);
+
+ mem->sgt = NULL;
+ mem->attach = NULL;
+ mem->dmabuf = NULL;
+}
+
+static int io_import_dmabuf(struct io_zcrx_ifq *ifq,
+ struct io_zcrx_mem *mem,
+ struct io_uring_zcrx_area_reg *area_reg)
+{
+ unsigned long off = (unsigned long)area_reg->addr;
+ unsigned long len = (unsigned long)area_reg->len;
+ unsigned long total_size = 0;
+ struct scatterlist *sg;
+ int dmabuf_fd = area_reg->dmabuf_fd;
+ int i, ret;
+
+ if (WARN_ON_ONCE(!ifq->dev))
+ return -EFAULT;
+ if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER))
+ return -EINVAL;
+
+ mem->is_dmabuf = true;
+ mem->dmabuf = dma_buf_get(dmabuf_fd);
+ if (IS_ERR(mem->dmabuf)) {
+ ret = PTR_ERR(mem->dmabuf);
+ mem->dmabuf = NULL;
+ goto err;
+ }
+
+ mem->attach = dma_buf_attach(mem->dmabuf, ifq->dev);
+ if (IS_ERR(mem->attach)) {
+ ret = PTR_ERR(mem->attach);
+ mem->attach = NULL;
+ goto err;
+ }
+
+ mem->sgt = dma_buf_map_attachment_unlocked(mem->attach, DMA_FROM_DEVICE);
+ if (IS_ERR(mem->sgt)) {
+ ret = PTR_ERR(mem->sgt);
+ mem->sgt = NULL;
+ goto err;
+ }
+
+ for_each_sgtable_dma_sg(mem->sgt, sg, i)
+ total_size += sg_dma_len(sg);
+
+ if (total_size < off + len)
+ return -EINVAL;
+
+ mem->dmabuf_offset = off;
+ mem->size = len;
+ return 0;
+err:
+ io_release_dmabuf(mem);
+ return ret;
+}
+
+static int io_zcrx_map_area_dmabuf(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
+{
+ unsigned long off = area->mem.dmabuf_offset;
+ struct scatterlist *sg;
+ unsigned i, niov_idx = 0;
+
+ if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER))
+ return -EINVAL;
+
+ for_each_sgtable_dma_sg(area->mem.sgt, sg, i) {
+ dma_addr_t dma = sg_dma_address(sg);
+ unsigned long sg_len = sg_dma_len(sg);
+ unsigned long sg_off = min(sg_len, off);
+
+ off -= sg_off;
+ sg_len -= sg_off;
+ dma += sg_off;
+
+ while (sg_len && niov_idx < area->nia.num_niovs) {
+ struct net_iov *niov = &area->nia.niovs[niov_idx];
+
+ if (net_mp_niov_set_dma_addr(niov, dma))
+ return 0;
+ sg_len -= PAGE_SIZE;
+ dma += PAGE_SIZE;
+ niov_idx++;
+ }
+ }
+ return niov_idx;
+}
+
+static int io_import_umem(struct io_zcrx_ifq *ifq,
+ struct io_zcrx_mem *mem,
+ struct io_uring_zcrx_area_reg *area_reg)
+{
+ struct page **pages;
+ int nr_pages;
+
+ if (area_reg->dmabuf_fd)
+ return -EINVAL;
+ if (!area_reg->addr)
+ return -EFAULT;
+ pages = io_pin_pages((unsigned long)area_reg->addr, area_reg->len,
+ &nr_pages);
+ if (IS_ERR(pages))
+ return PTR_ERR(pages);
+
+ mem->pages = pages;
+ mem->nr_folios = nr_pages;
+ mem->size = area_reg->len;
+ return 0;
+}
+
+static void io_release_area_mem(struct io_zcrx_mem *mem)
+{
+ if (mem->is_dmabuf) {
+ io_release_dmabuf(mem);
+ return;
+ }
+ if (mem->pages) {
+ unpin_user_pages(mem->pages, mem->nr_folios);
+ kvfree(mem->pages);
+ }
+}
+
+static int io_import_area(struct io_zcrx_ifq *ifq,
+ struct io_zcrx_mem *mem,
+ struct io_uring_zcrx_area_reg *area_reg)
+{
+ int ret;
+
+ ret = io_validate_user_buf_range(area_reg->addr, area_reg->len);
+ if (ret)
+ return ret;
+ if (area_reg->addr & ~PAGE_MASK || area_reg->len & ~PAGE_MASK)
+ return -EINVAL;
+
+ if (area_reg->flags & IORING_ZCRX_AREA_DMABUF)
+ return io_import_dmabuf(ifq, mem, area_reg);
+ return io_import_umem(ifq, mem, area_reg);
+}
+
+static void io_zcrx_unmap_umem(struct io_zcrx_ifq *ifq,
+ struct io_zcrx_area *area, int nr_mapped)
{
int i;
for (i = 0; i < nr_mapped; i++) {
- struct net_iov *niov = &area->nia.niovs[i];
- dma_addr_t dma;
+ netmem_ref netmem = net_iov_to_netmem(&area->nia.niovs[i]);
+ dma_addr_t dma = page_pool_get_dma_addr_netmem(netmem);
- dma = page_pool_get_dma_addr_netmem(net_iov_to_netmem(niov));
dma_unmap_page_attrs(ifq->dev, dma, PAGE_SIZE,
DMA_FROM_DEVICE, IO_DMA_ATTR);
- net_mp_niov_set_dma_addr(niov, 0);
}
}
+static void __io_zcrx_unmap_area(struct io_zcrx_ifq *ifq,
+ struct io_zcrx_area *area, int nr_mapped)
+{
+ int i;
+
+ if (area->mem.is_dmabuf)
+ io_release_dmabuf(&area->mem);
+ else
+ io_zcrx_unmap_umem(ifq, area, nr_mapped);
+
+ for (i = 0; i < area->nia.num_niovs; i++)
+ net_mp_niov_set_dma_addr(&area->nia.niovs[i], 0);
+}
+
static void io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
{
guard(mutex)(&ifq->dma_lock);
@@ -58,20 +236,16 @@ static void io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *are
area->is_mapped = false;
}
-static int io_zcrx_map_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
+static int io_zcrx_map_area_umem(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
{
int i;
- guard(mutex)(&ifq->dma_lock);
- if (area->is_mapped)
- return 0;
-
for (i = 0; i < area->nia.num_niovs; i++) {
struct net_iov *niov = &area->nia.niovs[i];
dma_addr_t dma;
- dma = dma_map_page_attrs(ifq->dev, area->pages[i], 0, PAGE_SIZE,
- DMA_FROM_DEVICE, IO_DMA_ATTR);
+ dma = dma_map_page_attrs(ifq->dev, area->mem.pages[i], 0,
+ PAGE_SIZE, DMA_FROM_DEVICE, IO_DMA_ATTR);
if (dma_mapping_error(ifq->dev, dma))
break;
if (net_mp_niov_set_dma_addr(niov, dma)) {
@@ -80,9 +254,24 @@ static int io_zcrx_map_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
break;
}
}
+ return i;
+}
+
+static int io_zcrx_map_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
+{
+ unsigned nr;
+
+ guard(mutex)(&ifq->dma_lock);
+ if (area->is_mapped)
+ return 0;
+
+ if (area->mem.is_dmabuf)
+ nr = io_zcrx_map_area_dmabuf(ifq, area);
+ else
+ nr = io_zcrx_map_area_umem(ifq, area);
- if (i != area->nia.num_niovs) {
- __io_zcrx_unmap_area(ifq, area, i);
+ if (nr != area->nia.num_niovs) {
+ __io_zcrx_unmap_area(ifq, area, nr);
return -EINVAL;
}
@@ -118,13 +307,6 @@ struct io_zcrx_args {
static const struct memory_provider_ops io_uring_pp_zc_ops;
-static inline struct io_zcrx_area *io_zcrx_iov_to_area(const struct net_iov *niov)
-{
- struct net_iov_area *owner = net_iov_owner(niov);
-
- return container_of(owner, struct io_zcrx_area, nia);
-}
-
static inline atomic_t *io_get_user_counter(struct net_iov *niov)
{
struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
@@ -147,17 +329,12 @@ static void io_zcrx_get_niov_uref(struct net_iov *niov)
atomic_inc(io_get_user_counter(niov));
}
-static inline struct page *io_zcrx_iov_page(const struct net_iov *niov)
-{
- struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
-
- return area->pages[net_iov_idx(niov)];
-}
-
static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq,
struct io_uring_zcrx_ifq_reg *reg,
- struct io_uring_region_desc *rd)
+ struct io_uring_region_desc *rd,
+ u32 id)
{
+ u64 mmap_offset;
size_t off, size;
void *ptr;
int ret;
@@ -167,12 +344,14 @@ static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq,
if (size > rd->size)
return -EINVAL;
- ret = io_create_region_mmap_safe(ifq->ctx, &ifq->ctx->zcrx_region, rd,
- IORING_MAP_OFF_ZCRX_REGION);
+ mmap_offset = IORING_MAP_OFF_ZCRX_REGION;
+ mmap_offset += id << IORING_OFF_PBUF_SHIFT;
+
+ ret = io_create_region(ifq->ctx, &ifq->region, rd, mmap_offset);
if (ret < 0)
return ret;
- ptr = io_region_get_ptr(&ifq->ctx->zcrx_region);
+ ptr = io_region_get_ptr(&ifq->region);
ifq->rq_ring = (struct io_uring *)ptr;
ifq->rqes = (struct io_uring_zcrx_rqe *)(ptr + off);
return 0;
@@ -180,7 +359,7 @@ static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq,
static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq)
{
- io_free_region(ifq->ctx, &ifq->ctx->zcrx_region);
+ io_free_region(ifq->ctx, &ifq->region);
ifq->rq_ring = NULL;
ifq->rqes = NULL;
}
@@ -188,53 +367,44 @@ static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq)
static void io_zcrx_free_area(struct io_zcrx_area *area)
{
io_zcrx_unmap_area(area->ifq, area);
+ io_release_area_mem(&area->mem);
kvfree(area->freelist);
kvfree(area->nia.niovs);
kvfree(area->user_refs);
- if (area->pages) {
- unpin_user_pages(area->pages, area->nr_folios);
- kvfree(area->pages);
- }
kfree(area);
}
+#define IO_ZCRX_AREA_SUPPORTED_FLAGS (IORING_ZCRX_AREA_DMABUF)
+
static int io_zcrx_create_area(struct io_zcrx_ifq *ifq,
struct io_zcrx_area **res,
struct io_uring_zcrx_area_reg *area_reg)
{
struct io_zcrx_area *area;
- int i, ret, nr_pages, nr_iovs;
- struct iovec iov;
+ unsigned nr_iovs;
+ int i, ret;
- if (area_reg->flags || area_reg->rq_area_token)
+ if (area_reg->flags & ~IO_ZCRX_AREA_SUPPORTED_FLAGS)
return -EINVAL;
- if (area_reg->__resv1 || area_reg->__resv2[0] || area_reg->__resv2[1])
+ if (area_reg->rq_area_token)
return -EINVAL;
- if (area_reg->addr & ~PAGE_MASK || area_reg->len & ~PAGE_MASK)
+ if (area_reg->__resv2[0] || area_reg->__resv2[1])
return -EINVAL;
- iov.iov_base = u64_to_user_ptr(area_reg->addr);
- iov.iov_len = area_reg->len;
- ret = io_buffer_validate(&iov);
- if (ret)
- return ret;
-
ret = -ENOMEM;
area = kzalloc(sizeof(*area), GFP_KERNEL);
if (!area)
goto err;
- area->pages = io_pin_pages((unsigned long)area_reg->addr, area_reg->len,
- &nr_pages);
- if (IS_ERR(area->pages)) {
- ret = PTR_ERR(area->pages);
- area->pages = NULL;
+ ret = io_import_area(ifq, &area->mem, area_reg);
+ if (ret)
goto err;
- }
- area->nr_folios = nr_iovs = nr_pages;
+
+ nr_iovs = area->mem.size >> PAGE_SHIFT;
area->nia.num_niovs = nr_iovs;
+ ret = -ENOMEM;
area->nia.niovs = kvmalloc_array(nr_iovs, sizeof(area->nia.niovs[0]),
GFP_KERNEL | __GFP_ZERO);
if (!area->nia.niovs)
@@ -245,9 +415,6 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq,
if (!area->freelist)
goto err;
- for (i = 0; i < nr_iovs; i++)
- area->freelist[i] = i;
-
area->user_refs = kvmalloc_array(nr_iovs, sizeof(area->user_refs[0]),
GFP_KERNEL | __GFP_ZERO);
if (!area->user_refs)
@@ -341,6 +508,16 @@ static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq)
kfree(ifq);
}
+struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx,
+ unsigned int id)
+{
+ struct io_zcrx_ifq *ifq = xa_load(&ctx->zcrx_ctxs, id);
+
+ lockdep_assert_held(&ctx->mmap_lock);
+
+ return ifq ? &ifq->region : NULL;
+}
+
int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
struct io_uring_zcrx_ifq_reg __user *arg)
{
@@ -350,6 +527,7 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
struct io_uring_region_desc rd;
struct io_zcrx_ifq *ifq;
int ret;
+ u32 id;
/*
* 1. Interface queue allocation.
@@ -362,8 +540,6 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN &&
ctx->flags & IORING_SETUP_CQE32))
return -EINVAL;
- if (ctx->ifq)
- return -EBUSY;
if (copy_from_user(&reg, arg, sizeof(reg)))
return -EFAULT;
if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd)))
@@ -386,29 +562,37 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
ifq = io_zcrx_ifq_alloc(ctx);
if (!ifq)
return -ENOMEM;
+ ifq->rq_entries = reg.rq_entries;
- ret = io_allocate_rbuf_ring(ifq, &reg, &rd);
- if (ret)
- goto err;
+ scoped_guard(mutex, &ctx->mmap_lock) {
+ /* preallocate id */
+ ret = xa_alloc(&ctx->zcrx_ctxs, &id, NULL, xa_limit_31b, GFP_KERNEL);
+ if (ret)
+ goto ifq_free;
+ }
- ret = io_zcrx_create_area(ifq, &ifq->area, &area);
+ ret = io_allocate_rbuf_ring(ifq, &reg, &rd, id);
if (ret)
goto err;
- ifq->rq_entries = reg.rq_entries;
-
- ret = -ENODEV;
ifq->netdev = netdev_get_by_index(current->nsproxy->net_ns, reg.if_idx,
&ifq->netdev_tracker, GFP_KERNEL);
- if (!ifq->netdev)
+ if (!ifq->netdev) {
+ ret = -ENODEV;
goto err;
+ }
ifq->dev = ifq->netdev->dev.parent;
- ret = -EOPNOTSUPP;
- if (!ifq->dev)
+ if (!ifq->dev) {
+ ret = -EOPNOTSUPP;
goto err;
+ }
get_device(ifq->dev);
+ ret = io_zcrx_create_area(ifq, &ifq->area, &area);
+ if (ret)
+ goto err;
+
mp_param.mp_ops = &io_uring_pp_zc_ops;
mp_param.mp_priv = ifq;
ret = net_mp_open_rxq(ifq->netdev, reg.if_rxq, &mp_param);
@@ -419,6 +603,14 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
reg.offsets.rqes = sizeof(struct io_uring);
reg.offsets.head = offsetof(struct io_uring, head);
reg.offsets.tail = offsetof(struct io_uring, tail);
+ reg.zcrx_id = id;
+
+ scoped_guard(mutex, &ctx->mmap_lock) {
+ /* publish ifq */
+ ret = -ENOMEM;
+ if (xa_store(&ctx->zcrx_ctxs, id, ifq, GFP_KERNEL))
+ goto err;
+ }
if (copy_to_user(arg, &reg, sizeof(reg)) ||
copy_to_user(u64_to_user_ptr(reg.region_ptr), &rd, sizeof(rd)) ||
@@ -426,24 +618,34 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
ret = -EFAULT;
goto err;
}
- ctx->ifq = ifq;
return 0;
err:
+ scoped_guard(mutex, &ctx->mmap_lock)
+ xa_erase(&ctx->zcrx_ctxs, id);
+ifq_free:
io_zcrx_ifq_free(ifq);
return ret;
}
void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx)
{
- struct io_zcrx_ifq *ifq = ctx->ifq;
+ struct io_zcrx_ifq *ifq;
+ unsigned long id;
lockdep_assert_held(&ctx->uring_lock);
- if (!ifq)
- return;
+ while (1) {
+ scoped_guard(mutex, &ctx->mmap_lock) {
+ ifq = xa_find(&ctx->zcrx_ctxs, &id, ULONG_MAX, XA_PRESENT);
+ if (ifq)
+ xa_erase(&ctx->zcrx_ctxs, id);
+ }
+ if (!ifq)
+ break;
+ io_zcrx_ifq_free(ifq);
+ }
- ctx->ifq = NULL;
- io_zcrx_ifq_free(ifq);
+ xa_destroy(&ctx->zcrx_ctxs);
}
static struct net_iov *__io_zcrx_get_free_niov(struct io_zcrx_area *area)
@@ -500,12 +702,15 @@ static void io_zcrx_scrub(struct io_zcrx_ifq *ifq)
void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx)
{
+ struct io_zcrx_ifq *ifq;
+ unsigned long index;
+
lockdep_assert_held(&ctx->uring_lock);
- if (!ctx->ifq)
- return;
- io_zcrx_scrub(ctx->ifq);
- io_close_queue(ctx->ifq);
+ xa_for_each(&ctx->zcrx_ctxs, index, ifq) {
+ io_zcrx_scrub(ifq);
+ io_close_queue(ifq);
+ }
}
static inline u32 io_zcrx_rqring_entries(struct io_zcrx_ifq *ifq)
@@ -742,6 +947,9 @@ static ssize_t io_zcrx_copy_chunk(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
size_t copied = 0;
int ret = 0;
+ if (area->mem.is_dmabuf)
+ return -EFAULT;
+
while (len) {
size_t copy_size = min_t(size_t, PAGE_SIZE, len);
const int dst_off = 0;
diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h
index f2bc811f022c..2f5e26389f22 100644
--- a/io_uring/zcrx.h
+++ b/io_uring/zcrx.h
@@ -3,10 +3,24 @@
#define IOU_ZC_RX_H
#include <linux/io_uring_types.h>
+#include <linux/dma-buf.h>
#include <linux/socket.h>
#include <net/page_pool/types.h>
#include <net/net_trackers.h>
+struct io_zcrx_mem {
+ unsigned long size;
+ bool is_dmabuf;
+
+ struct page **pages;
+ unsigned long nr_folios;
+
+ struct dma_buf_attachment *attach;
+ struct dma_buf *dmabuf;
+ struct sg_table *sgt;
+ unsigned long dmabuf_offset;
+};
+
struct io_zcrx_area {
struct net_iov_area nia;
struct io_zcrx_ifq *ifq;
@@ -14,13 +28,13 @@ struct io_zcrx_area {
bool is_mapped;
u16 area_id;
- struct page **pages;
- unsigned long nr_folios;
/* freelist */
spinlock_t freelist_lock ____cacheline_aligned_in_smp;
u32 free_count;
u32 *freelist;
+
+ struct io_zcrx_mem mem;
};
struct io_zcrx_ifq {
@@ -39,6 +53,7 @@ struct io_zcrx_ifq {
netdevice_tracker netdev_tracker;
spinlock_t lock;
struct mutex dma_lock;
+ struct io_mapped_region region;
};
#if defined(CONFIG_IO_URING_ZCRX)
@@ -49,6 +64,8 @@ void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx);
int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
struct socket *sock, unsigned int flags,
unsigned issue_flags, unsigned int *len);
+struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx,
+ unsigned int id);
#else
static inline int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
struct io_uring_zcrx_ifq_reg __user *arg)
@@ -67,6 +84,11 @@ static inline int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
{
return -EOPNOTSUPP;
}
+static inline struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx,
+ unsigned int id)
+{
+ return NULL;
+}
#endif
int io_recvzc(struct io_kiocb *req, unsigned int issue_flags);
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 35b4f8659904..82ed2d3c9846 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -913,7 +913,7 @@ static int do_mq_open(const char __user *u_name, int oflag, umode_t mode,
ro = mnt_want_write(mnt); /* we'll drop it in any case */
inode_lock(d_inode(root));
- path.dentry = lookup_one_len(name->name, root, strlen(name->name));
+ path.dentry = lookup_noperm(&QSTR(name->name), root);
if (IS_ERR(path.dentry)) {
error = PTR_ERR(path.dentry);
goto out_putfd;
@@ -969,8 +969,7 @@ SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name)
if (err)
goto out_name;
inode_lock_nested(d_inode(mnt->mnt_root), I_MUTEX_PARENT);
- dentry = lookup_one_len(name->name, mnt->mnt_root,
- strlen(name->name));
+ dentry = lookup_noperm(&QSTR(name->name), mnt->mnt_root);
if (IS_ERR(dentry)) {
err = PTR_ERR(dentry);
goto out_unlock;
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index dc3aa91a6ba0..5c2e96b19392 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -421,7 +421,7 @@ static int bpf_iter_link_pin_kernel(struct dentry *parent,
int ret;
inode_lock(parent->d_inode);
- dentry = lookup_one_len(name, parent, strlen(name));
+ dentry = lookup_noperm(&QSTR(name), parent);
if (IS_ERR(dentry)) {
inode_unlock(parent->d_inode);
return PTR_ERR(dentry);
diff --git a/kernel/exit.c b/kernel/exit.c
index 1b51dc099f1e..c33ecde016de 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -133,8 +133,13 @@ struct release_task_post {
static void __unhash_process(struct release_task_post *post, struct task_struct *p,
bool group_dead)
{
+ struct pid *pid = task_pid(p);
+
nr_threads--;
+
detach_pid(post->pids, p, PIDTYPE_PID);
+ wake_up_all(&pid->wait_pidfd);
+
if (group_dead) {
detach_pid(post->pids, p, PIDTYPE_TGID);
detach_pid(post->pids, p, PIDTYPE_PGID);
@@ -253,7 +258,8 @@ repeat:
pidfs_exit(p);
cgroup_release(p);
- thread_pid = get_pid(p->thread_pid);
+ /* Retrieve @thread_pid before __unhash_process() may set it to NULL. */
+ thread_pid = task_pid(p);
write_lock_irq(&tasklist_lock);
ptrace_release_task(p);
@@ -282,8 +288,8 @@ repeat:
}
write_unlock_irq(&tasklist_lock);
+ /* @thread_pid can't go away until free_pids() below */
proc_flush_pid(thread_pid);
- put_pid(thread_pid);
add_device_randomness(&p->se.sum_exec_runtime,
sizeof(p->se.sum_exec_runtime));
free_pids(post.pids);
diff --git a/kernel/fork.c b/kernel/fork.c
index c4b26cd8998b..aad55b0e103c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -498,10 +498,6 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
vma_numab_state_init(new);
dup_anon_vma_name(orig, new);
- /* track_pfn_copy() will later take care of copying internal state. */
- if (unlikely(new->vm_flags & VM_PFNMAP))
- untrack_pfn_clear(new);
-
return new;
}
@@ -672,6 +668,11 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
tmp = vm_area_dup(mpnt);
if (!tmp)
goto fail_nomem;
+
+ /* track_pfn_copy() will later take care of copying internal state. */
+ if (unlikely(tmp->vm_flags & VM_PFNMAP))
+ untrack_pfn_clear(tmp);
+
retval = vma_dup_policy(mpnt, tmp);
if (retval)
goto fail_nomem_policy;
@@ -2036,17 +2037,16 @@ static inline void rcu_copy_process(struct task_struct *p)
}
/**
- * __pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
+ * pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
* @pid: the struct pid for which to create a pidfd
* @flags: flags of the new @pidfd
- * @ret: Where to return the file for the pidfd.
+ * @ret_file: return the new pidfs file
*
* Allocate a new file that stashes @pid and reserve a new pidfd number in the
* caller's file descriptor table. The pidfd is reserved but not installed yet.
*
- * The helper doesn't perform checks on @pid which makes it useful for pidfds
- * created via CLONE_PIDFD where @pid has no task attached when the pidfd and
- * pidfd file are prepared.
+ * The helper verifies that @pid is still in use, without PIDFD_THREAD the
+ * task identified by @pid must be a thread-group leader.
*
* If this function returns successfully the caller is responsible to either
* call fd_install() passing the returned pidfd and pidfd file as arguments in
@@ -2063,59 +2063,50 @@ static inline void rcu_copy_process(struct task_struct *p)
* error, a negative error code is returned from the function and the
* last argument remains unchanged.
*/
-static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
+int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret_file)
{
- struct file *pidfd_file;
+ struct file *pidfs_file;
+
+ /*
+ * PIDFD_STALE is only allowed to be passed if the caller knows
+ * that @pid is already registered in pidfs and thus
+ * PIDFD_INFO_EXIT information is guaranteed to be available.
+ */
+ if (!(flags & PIDFD_STALE)) {
+ /*
+ * While holding the pidfd waitqueue lock removing the
+ * task linkage for the thread-group leader pid
+ * (PIDTYPE_TGID) isn't possible. Thus, if there's still
+ * task linkage for PIDTYPE_PID not having thread-group
+ * leader linkage for the pid means it wasn't a
+ * thread-group leader in the first place.
+ */
+ guard(spinlock_irq)(&pid->wait_pidfd.lock);
+
+ /* Task has already been reaped. */
+ if (!pid_has_task(pid, PIDTYPE_PID))
+ return -ESRCH;
+ /*
+ * If this struct pid isn't used as a thread-group
+ * leader but the caller requested to create a
+ * thread-group leader pidfd then report ENOENT.
+ */
+ if (!(flags & PIDFD_THREAD) && !pid_has_task(pid, PIDTYPE_TGID))
+ return -ENOENT;
+ }
CLASS(get_unused_fd, pidfd)(O_CLOEXEC);
if (pidfd < 0)
return pidfd;
- pidfd_file = pidfs_alloc_file(pid, flags | O_RDWR);
- if (IS_ERR(pidfd_file))
- return PTR_ERR(pidfd_file);
+ pidfs_file = pidfs_alloc_file(pid, flags | O_RDWR);
+ if (IS_ERR(pidfs_file))
+ return PTR_ERR(pidfs_file);
- *ret = pidfd_file;
+ *ret_file = pidfs_file;
return take_fd(pidfd);
}
-/**
- * pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
- * @pid: the struct pid for which to create a pidfd
- * @flags: flags of the new @pidfd
- * @ret: Where to return the pidfd.
- *
- * Allocate a new file that stashes @pid and reserve a new pidfd number in the
- * caller's file descriptor table. The pidfd is reserved but not installed yet.
- *
- * The helper verifies that @pid is still in use, without PIDFD_THREAD the
- * task identified by @pid must be a thread-group leader.
- *
- * If this function returns successfully the caller is responsible to either
- * call fd_install() passing the returned pidfd and pidfd file as arguments in
- * order to install the pidfd into its file descriptor table or they must use
- * put_unused_fd() and fput() on the returned pidfd and pidfd file
- * respectively.
- *
- * This function is useful when a pidfd must already be reserved but there
- * might still be points of failure afterwards and the caller wants to ensure
- * that no pidfd is leaked into its file descriptor table.
- *
- * Return: On success, a reserved pidfd is returned from the function and a new
- * pidfd file is returned in the last argument to the function. On
- * error, a negative error code is returned from the function and the
- * last argument remains unchanged.
- */
-int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
-{
- bool thread = flags & PIDFD_THREAD;
-
- if (!pid || !pid_has_task(pid, thread ? PIDTYPE_PID : PIDTYPE_TGID))
- return -EINVAL;
-
- return __pidfd_prepare(pid, flags, ret);
-}
-
static void __delayed_free_task(struct rcu_head *rhp)
{
struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
@@ -2462,7 +2453,7 @@ __latent_entropy struct task_struct *copy_process(
* Note that no task has been attached to @pid yet indicate
* that via CLONE_PIDFD.
*/
- retval = __pidfd_prepare(pid, flags | PIDFD_CLONE, &pidfile);
+ retval = pidfd_prepare(pid, flags | PIDFD_STALE, &pidfile);
if (retval < 0)
goto bad_fork_free_pid;
pidfd = retval;
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index d6964fc29f51..ef234469baac 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -138,7 +138,8 @@ static int percpu_rwsem_wake_function(struct wait_queue_entry *wq_entry,
return !reader; /* wake (readers until) 1 writer */
}
-static void percpu_rwsem_wait(struct percpu_rw_semaphore *sem, bool reader)
+static void percpu_rwsem_wait(struct percpu_rw_semaphore *sem, bool reader,
+ bool freeze)
{
DEFINE_WAIT_FUNC(wq_entry, percpu_rwsem_wake_function);
bool wait;
@@ -156,7 +157,8 @@ static void percpu_rwsem_wait(struct percpu_rw_semaphore *sem, bool reader)
spin_unlock_irq(&sem->waiters.lock);
while (wait) {
- set_current_state(TASK_UNINTERRUPTIBLE);
+ set_current_state(TASK_UNINTERRUPTIBLE |
+ (freeze ? TASK_FREEZABLE : 0));
if (!smp_load_acquire(&wq_entry.private))
break;
schedule();
@@ -164,7 +166,8 @@ static void percpu_rwsem_wait(struct percpu_rw_semaphore *sem, bool reader)
__set_current_state(TASK_RUNNING);
}
-bool __sched __percpu_down_read(struct percpu_rw_semaphore *sem, bool try)
+bool __sched __percpu_down_read(struct percpu_rw_semaphore *sem, bool try,
+ bool freeze)
{
if (__percpu_down_read_trylock(sem))
return true;
@@ -174,7 +177,7 @@ bool __sched __percpu_down_read(struct percpu_rw_semaphore *sem, bool try)
trace_contention_begin(sem, LCB_F_PERCPU | LCB_F_READ);
preempt_enable();
- percpu_rwsem_wait(sem, /* .reader = */ true);
+ percpu_rwsem_wait(sem, /* .reader = */ true, freeze);
preempt_disable();
trace_contention_end(sem, 0);
@@ -237,7 +240,7 @@ void __sched percpu_down_write(struct percpu_rw_semaphore *sem)
*/
if (!__percpu_down_write_trylock(sem)) {
trace_contention_begin(sem, LCB_F_PERCPU | LCB_F_WRITE);
- percpu_rwsem_wait(sem, /* .reader = */ false);
+ percpu_rwsem_wait(sem, /* .reader = */ false, false);
contended = true;
}
diff --git a/kernel/module/main.c b/kernel/module/main.c
index a2859dc3eea6..5c6ab20240a6 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -2829,6 +2829,7 @@ static void module_deallocate(struct module *mod, struct load_info *info)
{
percpu_modfree(mod);
module_arch_freeing_init(mod);
+ codetag_free_module_sections(mod);
free_mod_mem(mod);
}
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index c9d97ed20122..5f31fdff8a38 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -128,17 +128,13 @@ out_time:
out_net:
put_cgroup_ns(new_nsp->cgroup_ns);
out_cgroup:
- if (new_nsp->pid_ns_for_children)
- put_pid_ns(new_nsp->pid_ns_for_children);
+ put_pid_ns(new_nsp->pid_ns_for_children);
out_pid:
- if (new_nsp->ipc_ns)
- put_ipc_ns(new_nsp->ipc_ns);
+ put_ipc_ns(new_nsp->ipc_ns);
out_ipc:
- if (new_nsp->uts_ns)
- put_uts_ns(new_nsp->uts_ns);
+ put_uts_ns(new_nsp->uts_ns);
out_uts:
- if (new_nsp->mnt_ns)
- put_mnt_ns(new_nsp->mnt_ns);
+ put_mnt_ns(new_nsp->mnt_ns);
out_ns:
kmem_cache_free(nsproxy_cachep, new_nsp);
return ERR_PTR(err);
@@ -189,18 +185,12 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
void free_nsproxy(struct nsproxy *ns)
{
- if (ns->mnt_ns)
- put_mnt_ns(ns->mnt_ns);
- if (ns->uts_ns)
- put_uts_ns(ns->uts_ns);
- if (ns->ipc_ns)
- put_ipc_ns(ns->ipc_ns);
- if (ns->pid_ns_for_children)
- put_pid_ns(ns->pid_ns_for_children);
- if (ns->time_ns)
- put_time_ns(ns->time_ns);
- if (ns->time_ns_for_children)
- put_time_ns(ns->time_ns_for_children);
+ put_mnt_ns(ns->mnt_ns);
+ put_uts_ns(ns->uts_ns);
+ put_ipc_ns(ns->ipc_ns);
+ put_pid_ns(ns->pid_ns_for_children);
+ put_time_ns(ns->time_ns);
+ put_time_ns(ns->time_ns_for_children);
put_cgroup_ns(ns->cgroup_ns);
put_net(ns->net_ns);
kmem_cache_free(nsproxy_cachep, ns);
diff --git a/kernel/padata.c b/kernel/padata.c
index b3d4eacc4f5d..7eee94166357 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -358,7 +358,8 @@ static void padata_reorder(struct parallel_data *pd)
* To avoid UAF issue, add pd ref here, and put pd ref after reorder_work finish.
*/
padata_get_pd(pd);
- queue_work(pinst->serial_wq, &pd->reorder_work);
+ if (!queue_work(pinst->serial_wq, &pd->reorder_work))
+ padata_put_pd(pd);
}
}
diff --git a/kernel/pid.c b/kernel/pid.c
index 4ac2ce46817f..8317bcbc7cf7 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -100,6 +100,7 @@ void put_pid(struct pid *pid)
ns = pid->numbers[pid->level].ns;
if (refcount_dec_and_test(&pid->count)) {
+ WARN_ON_ONCE(pid->stashed);
kmem_cache_free(ns->pid_cachep, pid);
put_pid_ns(ns);
}
@@ -359,11 +360,6 @@ static void __change_pid(struct pid **pids, struct task_struct *task,
hlist_del_rcu(&task->pid_links[type]);
*pid_ptr = new;
- if (type == PIDTYPE_PID) {
- WARN_ON_ONCE(pid_has_task(pid, PIDTYPE_PID));
- wake_up_all(&pid->wait_pidfd);
- }
-
for (tmp = PIDTYPE_MAX; --tmp >= 0; )
if (pid_has_task(pid, tmp))
return;
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 23c0f4e6cb2f..338c9917d4ee 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -778,6 +778,8 @@ int hibernate(void)
goto Restore;
ksys_sync_helper();
+ if (filesystem_freeze_enabled)
+ filesystems_freeze();
error = freeze_processes();
if (error)
@@ -846,6 +848,7 @@ int hibernate(void)
/* Don't bother checking whether freezer_test_done is true */
freezer_test_done = false;
Exit:
+ filesystems_thaw();
pm_notifier_call_chain(PM_POST_HIBERNATION);
Restore:
pm_restore_console();
@@ -882,6 +885,9 @@ int hibernate_quiet_exec(int (*func)(void *data), void *data)
if (error)
goto restore;
+ if (filesystem_freeze_enabled)
+ filesystems_freeze();
+
error = freeze_processes();
if (error)
goto exit;
@@ -941,6 +947,7 @@ thaw:
thaw_processes();
exit:
+ filesystems_thaw();
pm_notifier_call_chain(PM_POST_HIBERNATION);
restore:
@@ -1029,19 +1036,26 @@ static int software_resume(void)
if (error)
goto Restore;
+ if (filesystem_freeze_enabled)
+ filesystems_freeze();
+
pm_pr_dbg("Preparing processes for hibernation restore.\n");
error = freeze_processes();
- if (error)
+ if (error) {
+ filesystems_thaw();
goto Close_Finish;
+ }
error = freeze_kernel_threads();
if (error) {
thaw_processes();
+ filesystems_thaw();
goto Close_Finish;
}
error = load_image_and_restore();
thaw_processes();
+ filesystems_thaw();
Finish:
pm_notifier_call_chain(PM_POST_RESTORE);
Restore:
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 6254814d4817..0b0e76324c43 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -962,6 +962,34 @@ power_attr(pm_freeze_timeout);
#endif /* CONFIG_FREEZER*/
+#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION)
+bool filesystem_freeze_enabled = false;
+
+static ssize_t freeze_filesystems_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%d\n", filesystem_freeze_enabled);
+}
+
+static ssize_t freeze_filesystems_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t n)
+{
+ unsigned long val;
+
+ if (kstrtoul(buf, 10, &val))
+ return -EINVAL;
+
+ if (val > 1)
+ return -EINVAL;
+
+ filesystem_freeze_enabled = !!val;
+ return n;
+}
+
+power_attr(freeze_filesystems);
+#endif /* CONFIG_SUSPEND || CONFIG_HIBERNATION */
+
static struct attribute * g[] = {
&state_attr.attr,
#ifdef CONFIG_PM_TRACE
@@ -992,6 +1020,9 @@ static struct attribute * g[] = {
#ifdef CONFIG_FREEZER
&pm_freeze_timeout_attr.attr,
#endif
+#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION)
+ &freeze_filesystems_attr.attr,
+#endif
NULL,
};
diff --git a/kernel/power/power.h b/kernel/power/power.h
index c352dea2f67b..2eb81662b8fa 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -18,6 +18,10 @@ struct swsusp_info {
unsigned long size;
} __aligned(PAGE_SIZE);
+#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION)
+extern bool filesystem_freeze_enabled;
+#endif
+
#ifdef CONFIG_HIBERNATION
/* kernel/power/snapshot.c */
extern void __init hibernate_reserved_size_init(void);
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 8eaec4ab121d..76b141b9aac0 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -30,6 +30,7 @@
#include <trace/events/power.h>
#include <linux/compiler.h>
#include <linux/moduleparam.h>
+#include <linux/fs.h>
#include "power.h"
@@ -374,6 +375,8 @@ static int suspend_prepare(suspend_state_t state)
if (error)
goto Restore;
+ if (filesystem_freeze_enabled)
+ filesystems_freeze();
trace_suspend_resume(TPS("freeze_processes"), 0, true);
error = suspend_freeze_processes();
trace_suspend_resume(TPS("freeze_processes"), 0, false);
@@ -550,6 +553,7 @@ int suspend_devices_and_enter(suspend_state_t state)
static void suspend_finish(void)
{
suspend_thaw_processes();
+ filesystems_thaw();
pm_notifier_call_chain(PM_POST_SUSPEND);
pm_restore_console();
}
@@ -588,6 +592,8 @@ static int enter_state(suspend_state_t state)
ksys_sync_helper();
trace_suspend_resume(TPS("sync_filesystems"), 0, false);
}
+ if (filesystem_freeze_enabled)
+ filesystems_freeze();
pm_pr_dbg("Preparing system for sleep (%s)\n", mem_sleep_labels[state]);
pm_suspend_clear_flags();
@@ -609,6 +615,7 @@ static int enter_state(suspend_state_t state)
pm_pr_dbg("Finishing wakeup.\n");
suspend_finish();
Unlock:
+ filesystems_thaw();
mutex_unlock(&system_transition_mutex);
return error;
}
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 80ff5f933a62..ad13c461b657 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -268,35 +268,26 @@ static void hib_end_io(struct bio *bio)
bio_put(bio);
}
-static int hib_submit_io(blk_opf_t opf, pgoff_t page_off, void *addr,
+static int hib_submit_io_sync(blk_opf_t opf, pgoff_t page_off, void *addr)
+{
+ return bdev_rw_virt(file_bdev(hib_resume_bdev_file),
+ page_off * (PAGE_SIZE >> 9), addr, PAGE_SIZE, opf);
+}
+
+static int hib_submit_io_async(blk_opf_t opf, pgoff_t page_off, void *addr,
struct hib_bio_batch *hb)
{
- struct page *page = virt_to_page(addr);
struct bio *bio;
- int error = 0;
bio = bio_alloc(file_bdev(hib_resume_bdev_file), 1, opf,
GFP_NOIO | __GFP_HIGH);
bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9);
-
- if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
- pr_err("Adding page to bio failed at %llu\n",
- (unsigned long long)bio->bi_iter.bi_sector);
- bio_put(bio);
- return -EFAULT;
- }
-
- if (hb) {
- bio->bi_end_io = hib_end_io;
- bio->bi_private = hb;
- atomic_inc(&hb->count);
- submit_bio(bio);
- } else {
- error = submit_bio_wait(bio);
- bio_put(bio);
- }
-
- return error;
+ bio_add_virt_nofail(bio, addr, PAGE_SIZE);
+ bio->bi_end_io = hib_end_io;
+ bio->bi_private = hb;
+ atomic_inc(&hb->count);
+ submit_bio(bio);
+ return 0;
}
static int hib_wait_io(struct hib_bio_batch *hb)
@@ -316,7 +307,7 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
{
int error;
- hib_submit_io(REQ_OP_READ, swsusp_resume_block, swsusp_header, NULL);
+ hib_submit_io_sync(REQ_OP_READ, swsusp_resume_block, swsusp_header);
if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) ||
!memcmp("SWAPSPACE2",swsusp_header->sig, 10)) {
memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10);
@@ -329,8 +320,8 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
swsusp_header->flags = flags;
if (flags & SF_CRC32_MODE)
swsusp_header->crc32 = handle->crc32;
- error = hib_submit_io(REQ_OP_WRITE | REQ_SYNC,
- swsusp_resume_block, swsusp_header, NULL);
+ error = hib_submit_io_sync(REQ_OP_WRITE | REQ_SYNC,
+ swsusp_resume_block, swsusp_header);
} else {
pr_err("Swap header not found!\n");
error = -ENODEV;
@@ -380,36 +371,30 @@ static int swsusp_swap_check(void)
static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb)
{
+ gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY;
void *src;
int ret;
if (!offset)
return -ENOSPC;
- if (hb) {
- src = (void *)__get_free_page(GFP_NOIO | __GFP_NOWARN |
- __GFP_NORETRY);
- if (src) {
- copy_page(src, buf);
- } else {
- ret = hib_wait_io(hb); /* Free pages */
- if (ret)
- return ret;
- src = (void *)__get_free_page(GFP_NOIO |
- __GFP_NOWARN |
- __GFP_NORETRY);
- if (src) {
- copy_page(src, buf);
- } else {
- WARN_ON_ONCE(1);
- hb = NULL; /* Go synchronous */
- src = buf;
- }
- }
- } else {
- src = buf;
+ if (!hb)
+ goto sync_io;
+
+ src = (void *)__get_free_page(gfp);
+ if (!src) {
+ ret = hib_wait_io(hb); /* Free pages */
+ if (ret)
+ return ret;
+ src = (void *)__get_free_page(gfp);
+ if (WARN_ON_ONCE(!src))
+ goto sync_io;
}
- return hib_submit_io(REQ_OP_WRITE | REQ_SYNC, offset, src, hb);
+
+ copy_page(src, buf);
+ return hib_submit_io_async(REQ_OP_WRITE | REQ_SYNC, offset, src, hb);
+sync_io:
+ return hib_submit_io_sync(REQ_OP_WRITE | REQ_SYNC, offset, buf);
}
static void release_swap_writer(struct swap_map_handle *handle)
@@ -1041,7 +1026,7 @@ static int get_swap_reader(struct swap_map_handle *handle,
return -ENOMEM;
}
- error = hib_submit_io(REQ_OP_READ, offset, tmp->map, NULL);
+ error = hib_submit_io_sync(REQ_OP_READ, offset, tmp->map);
if (error) {
release_swap_reader(handle);
return error;
@@ -1065,7 +1050,10 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf,
offset = handle->cur->entries[handle->k];
if (!offset)
return -EFAULT;
- error = hib_submit_io(REQ_OP_READ, offset, buf, hb);
+ if (hb)
+ error = hib_submit_io_async(REQ_OP_READ, offset, buf, hb);
+ else
+ error = hib_submit_io_sync(REQ_OP_READ, offset, buf);
if (error)
return error;
if (++handle->k >= MAP_PAGE_ENTRIES) {
@@ -1590,8 +1578,8 @@ int swsusp_check(bool exclusive)
BLK_OPEN_READ, holder, NULL);
if (!IS_ERR(hib_resume_bdev_file)) {
clear_page(swsusp_header);
- error = hib_submit_io(REQ_OP_READ, swsusp_resume_block,
- swsusp_header, NULL);
+ error = hib_submit_io_sync(REQ_OP_READ, swsusp_resume_block,
+ swsusp_header);
if (error)
goto put;
@@ -1599,9 +1587,9 @@ int swsusp_check(bool exclusive)
memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10);
swsusp_header_flags = swsusp_header->flags;
/* Reset swap signature now */
- error = hib_submit_io(REQ_OP_WRITE | REQ_SYNC,
+ error = hib_submit_io_sync(REQ_OP_WRITE | REQ_SYNC,
swsusp_resume_block,
- swsusp_header, NULL);
+ swsusp_header);
} else {
error = -EINVAL;
}
@@ -1650,13 +1638,12 @@ int swsusp_unmark(void)
{
int error;
- hib_submit_io(REQ_OP_READ, swsusp_resume_block,
- swsusp_header, NULL);
+ hib_submit_io_sync(REQ_OP_READ, swsusp_resume_block, swsusp_header);
if (!memcmp(HIBERNATE_SIG,swsusp_header->sig, 10)) {
memcpy(swsusp_header->sig,swsusp_header->orig_sig, 10);
- error = hib_submit_io(REQ_OP_WRITE | REQ_SYNC,
+ error = hib_submit_io_sync(REQ_OP_WRITE | REQ_SYNC,
swsusp_resume_block,
- swsusp_header, NULL);
+ swsusp_header);
} else {
pr_err("Cannot find swsusp signature!\n");
error = -ENODEV;
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 3679a6d18934..3f6a7bdc6edf 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -893,11 +893,6 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
rcu_read_unlock();
}
-static void blk_add_trace_bio_bounce(void *ignore, struct bio *bio)
-{
- blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_BOUNCE, 0);
-}
-
static void blk_add_trace_bio_complete(void *ignore,
struct request_queue *q, struct bio *bio)
{
@@ -1089,8 +1084,6 @@ static void blk_register_tracepoints(void)
WARN_ON(ret);
ret = register_trace_block_rq_complete(blk_add_trace_rq_complete, NULL);
WARN_ON(ret);
- ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL);
- WARN_ON(ret);
ret = register_trace_block_bio_complete(blk_add_trace_bio_complete, NULL);
WARN_ON(ret);
ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL);
@@ -1125,7 +1118,6 @@ static void blk_unregister_tracepoints(void)
unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL);
unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL);
unregister_trace_block_bio_complete(blk_add_trace_bio_complete, NULL);
- unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL);
unregister_trace_block_rq_complete(blk_add_trace_rq_complete, NULL);
unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL);
unregister_trace_block_rq_merge(blk_add_trace_rq_merge, NULL);
@@ -1462,7 +1454,6 @@ static const struct {
[__BLK_TA_UNPLUG_TIMER] = {{ "UT", "unplug_timer" }, blk_log_unplug },
[__BLK_TA_INSERT] = {{ "I", "insert" }, blk_log_generic },
[__BLK_TA_SPLIT] = {{ "X", "split" }, blk_log_split },
- [__BLK_TA_BOUNCE] = {{ "B", "bounce" }, blk_log_generic },
[__BLK_TA_REMAP] = {{ "A", "remap" }, blk_log_remap },
};
@@ -1896,6 +1887,8 @@ void blk_fill_rwbs(char *rwbs, blk_opf_t opf)
rwbs[i++] = 'S';
if (opf & REQ_META)
rwbs[i++] = 'M';
+ if (opf & REQ_ATOMIC)
+ rwbs[i++] = 'U';
rwbs[i] = '\0';
}
diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c
index 25ecc1334b67..c7f602fa7b23 100644
--- a/lib/alloc_tag.c
+++ b/lib/alloc_tag.c
@@ -350,18 +350,28 @@ static bool needs_section_mem(struct module *mod, unsigned long size)
return size >= sizeof(struct alloc_tag);
}
-static struct alloc_tag *find_used_tag(struct alloc_tag *from, struct alloc_tag *to)
+static bool clean_unused_counters(struct alloc_tag *start_tag,
+ struct alloc_tag *end_tag)
{
- while (from <= to) {
+ struct alloc_tag *tag;
+ bool ret = true;
+
+ for (tag = start_tag; tag <= end_tag; tag++) {
struct alloc_tag_counters counter;
- counter = alloc_tag_read(from);
- if (counter.bytes)
- return from;
- from++;
+ if (!tag->counters)
+ continue;
+
+ counter = alloc_tag_read(tag);
+ if (!counter.bytes) {
+ free_percpu(tag->counters);
+ tag->counters = NULL;
+ } else {
+ ret = false;
+ }
}
- return NULL;
+ return ret;
}
/* Called with mod_area_mt locked */
@@ -371,12 +381,16 @@ static void clean_unused_module_areas_locked(void)
struct module *val;
mas_for_each(&mas, val, module_tags.size) {
+ struct alloc_tag *start_tag;
+ struct alloc_tag *end_tag;
+
if (val != &unloaded_mod)
continue;
/* Release area if all tags are unused */
- if (!find_used_tag((struct alloc_tag *)(module_tags.start_addr + mas.index),
- (struct alloc_tag *)(module_tags.start_addr + mas.last)))
+ start_tag = (struct alloc_tag *)(module_tags.start_addr + mas.index);
+ end_tag = (struct alloc_tag *)(module_tags.start_addr + mas.last);
+ if (clean_unused_counters(start_tag, end_tag))
mas_erase(&mas);
}
}
@@ -561,7 +575,8 @@ unlock:
static void release_module_tags(struct module *mod, bool used)
{
MA_STATE(mas, &mod_area_mt, module_tags.size, module_tags.size);
- struct alloc_tag *tag;
+ struct alloc_tag *start_tag;
+ struct alloc_tag *end_tag;
struct module *val;
mas_lock(&mas);
@@ -575,15 +590,22 @@ static void release_module_tags(struct module *mod, bool used)
if (!used)
goto release_area;
- /* Find out if the area is used */
- tag = find_used_tag((struct alloc_tag *)(module_tags.start_addr + mas.index),
- (struct alloc_tag *)(module_tags.start_addr + mas.last));
- if (tag) {
- struct alloc_tag_counters counter = alloc_tag_read(tag);
+ start_tag = (struct alloc_tag *)(module_tags.start_addr + mas.index);
+ end_tag = (struct alloc_tag *)(module_tags.start_addr + mas.last);
+ if (!clean_unused_counters(start_tag, end_tag)) {
+ struct alloc_tag *tag;
+
+ for (tag = start_tag; tag <= end_tag; tag++) {
+ struct alloc_tag_counters counter;
+
+ if (!tag->counters)
+ continue;
- pr_info("%s:%u module %s func:%s has %llu allocated at module unload\n",
- tag->ct.filename, tag->ct.lineno, tag->ct.modname,
- tag->ct.function, counter.bytes);
+ counter = alloc_tag_read(tag);
+ pr_info("%s:%u module %s func:%s has %llu allocated at module unload\n",
+ tag->ct.filename, tag->ct.lineno, tag->ct.modname,
+ tag->ct.function, counter.bytes);
+ }
} else {
used = false;
}
@@ -596,6 +618,34 @@ out:
mas_unlock(&mas);
}
+static void load_module(struct module *mod, struct codetag *start, struct codetag *stop)
+{
+ /* Allocate module alloc_tag percpu counters */
+ struct alloc_tag *start_tag;
+ struct alloc_tag *stop_tag;
+ struct alloc_tag *tag;
+
+ if (!mod)
+ return;
+
+ start_tag = ct_to_alloc_tag(start);
+ stop_tag = ct_to_alloc_tag(stop);
+ for (tag = start_tag; tag < stop_tag; tag++) {
+ WARN_ON(tag->counters);
+ tag->counters = alloc_percpu(struct alloc_tag_counters);
+ if (!tag->counters) {
+ while (--tag >= start_tag) {
+ free_percpu(tag->counters);
+ tag->counters = NULL;
+ }
+ shutdown_mem_profiling(true);
+ pr_err("Failed to allocate memory for allocation tag percpu counters in the module %s. Memory allocation profiling is disabled!\n",
+ mod->name);
+ break;
+ }
+ }
+}
+
static void replace_module(struct module *mod, struct module *new_mod)
{
MA_STATE(mas, &mod_area_mt, 0, module_tags.size);
@@ -757,6 +807,7 @@ static int __init alloc_tag_init(void)
.needs_section_mem = needs_section_mem,
.alloc_section_mem = reserve_module_tags,
.free_section_mem = release_module_tags,
+ .module_load = load_module,
.module_replaced = replace_module,
#endif
};
diff --git a/lib/codetag.c b/lib/codetag.c
index 42aadd6c1454..de332e98d6f5 100644
--- a/lib/codetag.c
+++ b/lib/codetag.c
@@ -194,7 +194,7 @@ static int codetag_module_init(struct codetag_type *cttype, struct module *mod)
if (err >= 0) {
cttype->count += range_size(cttype, &range);
if (cttype->desc.module_load)
- cttype->desc.module_load(cttype, cmod);
+ cttype->desc.module_load(mod, range.start, range.stop);
}
up_write(&cttype->mod_lock);
@@ -333,7 +333,8 @@ void codetag_unload_module(struct module *mod)
}
if (found) {
if (cttype->desc.module_unload)
- cttype->desc.module_unload(cttype, cmod);
+ cttype->desc.module_unload(cmod->mod,
+ cmod->range.start, cmod->range.stop);
cttype->count -= range_size(cttype, &cmod->range);
idr_remove(&cttype->mod_idr, mod_id);
diff --git a/mm/cma.c b/mm/cma.c
index 15632939f20a..c04be488b099 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -608,7 +608,10 @@ static int __init __cma_declare_contiguous_nid(phys_addr_t *basep,
* complain. Find the boundary by adding one to the last valid
* address.
*/
- highmem_start = __pa(high_memory - 1) + 1;
+ if (IS_ENABLED(CONFIG_HIGHMEM))
+ highmem_start = __pa(high_memory - 1) + 1;
+ else
+ highmem_start = memblock_end_of_DRAM();
pr_debug("%s(size %pa, base %pa, limit %pa alignment %pa)\n",
__func__, &size, &base, &limit, &alignment);
diff --git a/mm/dmapool.c b/mm/dmapool.c
index f0bfc6c490f4..5be8cc1c6529 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -56,6 +56,7 @@ struct dma_pool { /* the pool */
unsigned int size;
unsigned int allocation;
unsigned int boundary;
+ int node;
char name[32];
struct list_head pools;
};
@@ -199,12 +200,13 @@ static void pool_block_push(struct dma_pool *pool, struct dma_block *block,
/**
- * dma_pool_create - Creates a pool of consistent memory blocks, for dma.
+ * dma_pool_create_node - Creates a pool of consistent memory blocks, for dma.
* @name: name of pool, for diagnostics
* @dev: device that will be doing the DMA
* @size: size of the blocks in this pool.
* @align: alignment requirement for blocks; must be a power of two
* @boundary: returned blocks won't cross this power of two boundary
+ * @node: optional NUMA node to allocate structs 'dma_pool' and 'dma_page' on
* Context: not in_interrupt()
*
* Given one of these pools, dma_pool_alloc()
@@ -221,8 +223,8 @@ static void pool_block_push(struct dma_pool *pool, struct dma_block *block,
* Return: a dma allocation pool with the requested characteristics, or
* %NULL if one can't be created.
*/
-struct dma_pool *dma_pool_create(const char *name, struct device *dev,
- size_t size, size_t align, size_t boundary)
+struct dma_pool *dma_pool_create_node(const char *name, struct device *dev,
+ size_t size, size_t align, size_t boundary, int node)
{
struct dma_pool *retval;
size_t allocation;
@@ -251,7 +253,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
boundary = min(boundary, allocation);
- retval = kzalloc(sizeof(*retval), GFP_KERNEL);
+ retval = kzalloc_node(sizeof(*retval), GFP_KERNEL, node);
if (!retval)
return retval;
@@ -264,6 +266,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
retval->size = size;
retval->boundary = boundary;
retval->allocation = allocation;
+ retval->node = node;
INIT_LIST_HEAD(&retval->pools);
/*
@@ -295,7 +298,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
mutex_unlock(&pools_reg_lock);
return retval;
}
-EXPORT_SYMBOL(dma_pool_create);
+EXPORT_SYMBOL(dma_pool_create_node);
static void pool_initialise_page(struct dma_pool *pool, struct dma_page *page)
{
@@ -335,7 +338,7 @@ static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags)
{
struct dma_page *page;
- page = kmalloc(sizeof(*page), mem_flags);
+ page = kmalloc_node(sizeof(*page), mem_flags, pool->node);
if (!page)
return NULL;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 6ea1be71aa42..6a3cf7935c14 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1250,7 +1250,7 @@ void hugetlb_dup_vma_private(struct vm_area_struct *vma)
/*
* Reset and decrement one ref on hugepage private reservation.
* Called with mm->mmap_lock writer semaphore held.
- * This function should be only used by move_vma() and operate on
+ * This function should be only used by mremap and operate on
* same sized vma. It should never come here with last ref on the
* reservation.
*/
@@ -2949,12 +2949,20 @@ int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn)
while (start_pfn < end_pfn) {
folio = pfn_folio(start_pfn);
+
+ /*
+ * The folio might have been dissolved from under our feet, so make sure
+ * to carefully check the state under the lock.
+ */
+ spin_lock_irq(&hugetlb_lock);
if (folio_test_hugetlb(folio)) {
h = folio_hstate(folio);
} else {
+ spin_unlock_irq(&hugetlb_lock);
start_pfn++;
continue;
}
+ spin_unlock_irq(&hugetlb_lock);
if (!folio_ref_count(folio)) {
ret = alloc_and_dissolve_hugetlb_folio(h, folio,
@@ -3010,7 +3018,7 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
struct hugepage_subpool *spool = subpool_vma(vma);
struct hstate *h = hstate_vma(vma);
struct folio *folio;
- long retval, gbl_chg;
+ long retval, gbl_chg, gbl_reserve;
map_chg_state map_chg;
int ret, idx;
struct hugetlb_cgroup *h_cg = NULL;
@@ -3163,8 +3171,16 @@ out_uncharge_cgroup_reservation:
hugetlb_cgroup_uncharge_cgroup_rsvd(idx, pages_per_huge_page(h),
h_cg);
out_subpool_put:
- if (map_chg)
- hugepage_subpool_put_pages(spool, 1);
+ /*
+ * put page to subpool iff the quota of subpool's rsv_hpages is used
+ * during hugepage_subpool_get_pages.
+ */
+ if (map_chg && !gbl_chg) {
+ gbl_reserve = hugepage_subpool_put_pages(spool, 1);
+ hugetlb_acct_memory(h, -gbl_reserve);
+ }
+
+
out_end_reservation:
if (map_chg != MAP_CHG_ENFORCED)
vma_end_reservation(h, vma, addr);
@@ -7239,7 +7255,7 @@ bool hugetlb_reserve_pages(struct inode *inode,
struct vm_area_struct *vma,
vm_flags_t vm_flags)
{
- long chg = -1, add = -1;
+ long chg = -1, add = -1, spool_resv, gbl_resv;
struct hstate *h = hstate_inode(inode);
struct hugepage_subpool *spool = subpool_inode(inode);
struct resv_map *resv_map;
@@ -7374,8 +7390,16 @@ bool hugetlb_reserve_pages(struct inode *inode,
return true;
out_put_pages:
- /* put back original number of pages, chg */
- (void)hugepage_subpool_put_pages(spool, chg);
+ spool_resv = chg - gbl_reserve;
+ if (spool_resv) {
+ /* put sub pool's reservation back, chg - gbl_reserve */
+ gbl_resv = hugepage_subpool_put_pages(spool, spool_resv);
+ /*
+ * subpool's reserved pages can not be put back due to race,
+ * return to hstate.
+ */
+ hugetlb_acct_memory(h, -gbl_resv);
+ }
out_uncharge_cgroup:
hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h),
chg * pages_per_huge_page(h), h_cg);
@@ -7915,3 +7939,17 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
hugetlb_unshare_pmds(vma, ALIGN(vma->vm_start, PUD_SIZE),
ALIGN_DOWN(vma->vm_end, PUD_SIZE));
}
+
+/*
+ * For hugetlb, mremap() is an odd edge case - while the VMA copying is
+ * performed, we permit both the old and new VMAs to reference the same
+ * reservation.
+ *
+ * We fix this up after the operation succeeds, or if a newly allocated VMA
+ * is closed as a result of a failure to allocate memory.
+ */
+void fixup_hugetlb_reservations(struct vm_area_struct *vma)
+{
+ if (is_vm_hugetlb_page(vma))
+ clear_vma_resv_huge_pages(vma);
+}
diff --git a/mm/internal.h b/mm/internal.h
index 25a29872c634..5c7a2b43ad76 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1590,7 +1590,6 @@ unsigned long move_page_tables(struct pagetable_move_control *pmc);
#ifdef CONFIG_UNACCEPTED_MEMORY
void accept_page(struct page *page);
-void unaccepted_cleanup_work(struct work_struct *work);
#else /* CONFIG_UNACCEPTED_MEMORY */
static inline void accept_page(struct page *page)
{
diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c
index 88d1c9dcb507..d2c70cd2afb1 100644
--- a/mm/kasan/shadow.c
+++ b/mm/kasan/shadow.c
@@ -292,33 +292,99 @@ void __init __weak kasan_populate_early_vm_area_shadow(void *start,
{
}
+struct vmalloc_populate_data {
+ unsigned long start;
+ struct page **pages;
+};
+
static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr,
- void *unused)
+ void *_data)
{
- unsigned long page;
+ struct vmalloc_populate_data *data = _data;
+ struct page *page;
pte_t pte;
+ int index;
if (likely(!pte_none(ptep_get(ptep))))
return 0;
- page = __get_free_page(GFP_KERNEL);
- if (!page)
- return -ENOMEM;
-
- __memset((void *)page, KASAN_VMALLOC_INVALID, PAGE_SIZE);
- pte = pfn_pte(PFN_DOWN(__pa(page)), PAGE_KERNEL);
+ index = PFN_DOWN(addr - data->start);
+ page = data->pages[index];
+ __memset(page_to_virt(page), KASAN_VMALLOC_INVALID, PAGE_SIZE);
+ pte = pfn_pte(page_to_pfn(page), PAGE_KERNEL);
spin_lock(&init_mm.page_table_lock);
if (likely(pte_none(ptep_get(ptep)))) {
set_pte_at(&init_mm, addr, ptep, pte);
- page = 0;
+ data->pages[index] = NULL;
}
spin_unlock(&init_mm.page_table_lock);
- if (page)
- free_page(page);
+
+ return 0;
+}
+
+static void ___free_pages_bulk(struct page **pages, int nr_pages)
+{
+ int i;
+
+ for (i = 0; i < nr_pages; i++) {
+ if (pages[i]) {
+ __free_pages(pages[i], 0);
+ pages[i] = NULL;
+ }
+ }
+}
+
+static int ___alloc_pages_bulk(struct page **pages, int nr_pages)
+{
+ unsigned long nr_populated, nr_total = nr_pages;
+ struct page **page_array = pages;
+
+ while (nr_pages) {
+ nr_populated = alloc_pages_bulk(GFP_KERNEL, nr_pages, pages);
+ if (!nr_populated) {
+ ___free_pages_bulk(page_array, nr_total - nr_pages);
+ return -ENOMEM;
+ }
+ pages += nr_populated;
+ nr_pages -= nr_populated;
+ }
+
return 0;
}
+static int __kasan_populate_vmalloc(unsigned long start, unsigned long end)
+{
+ unsigned long nr_pages, nr_total = PFN_UP(end - start);
+ struct vmalloc_populate_data data;
+ int ret = 0;
+
+ data.pages = (struct page **)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+ if (!data.pages)
+ return -ENOMEM;
+
+ while (nr_total) {
+ nr_pages = min(nr_total, PAGE_SIZE / sizeof(data.pages[0]));
+ ret = ___alloc_pages_bulk(data.pages, nr_pages);
+ if (ret)
+ break;
+
+ data.start = start;
+ ret = apply_to_page_range(&init_mm, start, nr_pages * PAGE_SIZE,
+ kasan_populate_vmalloc_pte, &data);
+ ___free_pages_bulk(data.pages, nr_pages);
+ if (ret)
+ break;
+
+ start += nr_pages * PAGE_SIZE;
+ nr_total -= nr_pages;
+ }
+
+ free_page((unsigned long)data.pages);
+
+ return ret;
+}
+
int kasan_populate_vmalloc(unsigned long addr, unsigned long size)
{
unsigned long shadow_start, shadow_end;
@@ -348,9 +414,7 @@ int kasan_populate_vmalloc(unsigned long addr, unsigned long size)
shadow_start = PAGE_ALIGN_DOWN(shadow_start);
shadow_end = PAGE_ALIGN(shadow_end);
- ret = apply_to_page_range(&init_mm, shadow_start,
- shadow_end - shadow_start,
- kasan_populate_vmalloc_pte, NULL);
+ ret = __kasan_populate_vmalloc(shadow_start, shadow_end);
if (ret)
return ret;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index c96c1f2b9cf5..2d4d65f25fec 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1168,7 +1168,6 @@ void mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
{
struct mem_cgroup *iter;
int ret = 0;
- int i = 0;
BUG_ON(mem_cgroup_is_root(memcg));
@@ -1178,10 +1177,9 @@ void mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it);
while (!ret && (task = css_task_iter_next(&it))) {
- /* Avoid potential softlockup warning */
- if ((++i & 1023) == 0)
- cond_resched();
ret = fn(task, arg);
+ /* Avoid potential softlockup warning */
+ cond_resched();
}
css_task_iter_end(&it);
if (ret) {
diff --git a/mm/memory.c b/mm/memory.c
index ba3ea0a82f7f..49199410805c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3751,7 +3751,7 @@ static bool __wp_can_reuse_large_anon_folio(struct folio *folio,
/* Stabilize the mapcount vs. refcount and recheck. */
folio_lock_large_mapcount(folio);
- VM_WARN_ON_ONCE(folio_large_mapcount(folio) < folio_ref_count(folio));
+ VM_WARN_ON_ONCE_FOLIO(folio_large_mapcount(folio) > folio_ref_count(folio), folio);
if (folio_test_large_maybe_mapped_shared(folio))
goto unlock;
diff --git a/mm/migrate.c b/mm/migrate.c
index 676d9cfc7059..c80591514e66 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -947,66 +947,20 @@ int filemap_migrate_folio(struct address_space *mapping,
EXPORT_SYMBOL_GPL(filemap_migrate_folio);
/*
- * Writeback a folio to clean the dirty state
- */
-static int writeout(struct address_space *mapping, struct folio *folio)
-{
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_NONE,
- .nr_to_write = 1,
- .range_start = 0,
- .range_end = LLONG_MAX,
- .for_reclaim = 1
- };
- int rc;
-
- if (!mapping->a_ops->writepage)
- /* No write method for the address space */
- return -EINVAL;
-
- if (!folio_clear_dirty_for_io(folio))
- /* Someone else already triggered a write */
- return -EAGAIN;
-
- /*
- * A dirty folio may imply that the underlying filesystem has
- * the folio on some queue. So the folio must be clean for
- * migration. Writeout may mean we lose the lock and the
- * folio state is no longer what we checked for earlier.
- * At this point we know that the migration attempt cannot
- * be successful.
- */
- remove_migration_ptes(folio, folio, 0);
-
- rc = mapping->a_ops->writepage(&folio->page, &wbc);
-
- if (rc != AOP_WRITEPAGE_ACTIVATE)
- /* unlocked. Relock */
- folio_lock(folio);
-
- return (rc < 0) ? -EIO : -EAGAIN;
-}
-
-/*
* Default handling if a filesystem does not provide a migration function.
*/
static int fallback_migrate_folio(struct address_space *mapping,
struct folio *dst, struct folio *src, enum migrate_mode mode)
{
- if (folio_test_dirty(src)) {
- /* Only writeback folios in full synchronous migration */
- switch (mode) {
- case MIGRATE_SYNC:
- break;
- default:
- return -EBUSY;
- }
- return writeout(mapping, src);
- }
+ WARN_ONCE(mapping->a_ops->writepages,
+ "%ps does not implement migrate_folio\n",
+ mapping->a_ops);
+ if (folio_test_dirty(src))
+ return -EBUSY;
/*
- * Buffers may be managed in a filesystem specific way.
- * We must have no buffers or drop them.
+ * Filesystem may have private data at folio->private that we
+ * can't migrate automatically.
*/
if (!filemap_release_folio(src, GFP_KERNEL))
return mode == MIGRATE_SYNC ? -EAGAIN : -EBUSY;
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 327764ca0ee4..eedce9321e13 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -1441,7 +1441,6 @@ static void __meminit zone_init_free_lists(struct zone *zone)
#ifdef CONFIG_UNACCEPTED_MEMORY
INIT_LIST_HEAD(&zone->unaccepted_pages);
- INIT_WORK(&zone->unaccepted_cleanup, unaccepted_cleanup_work);
#endif
}
diff --git a/mm/mremap.c b/mm/mremap.c
index 7db9da609c84..0d4948b720e2 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -1188,8 +1188,7 @@ static int copy_vma_and_data(struct vma_remap_struct *vrm,
mremap_userfaultfd_prep(new_vma, vrm->uf);
}
- if (is_vm_hugetlb_page(vma))
- clear_vma_resv_huge_pages(vma);
+ fixup_hugetlb_reservations(vma);
/* Tell pfnmap has moved from this vma */
if (unlikely(vma->vm_flags & VM_PFNMAP))
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index c81624bc3969..76200cd85fe7 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2621,27 +2621,6 @@ int write_cache_pages(struct address_space *mapping,
}
EXPORT_SYMBOL(write_cache_pages);
-static int writeback_use_writepage(struct address_space *mapping,
- struct writeback_control *wbc)
-{
- struct folio *folio = NULL;
- struct blk_plug plug;
- int err;
-
- blk_start_plug(&plug);
- while ((folio = writeback_iter(mapping, wbc, folio, &err))) {
- err = mapping->a_ops->writepage(&folio->page, wbc);
- if (err == AOP_WRITEPAGE_ACTIVATE) {
- folio_unlock(folio);
- err = 0;
- }
- mapping_set_error(mapping, err);
- }
- blk_finish_plug(&plug);
-
- return err;
-}
-
int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
int ret;
@@ -2652,14 +2631,11 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
wb = inode_to_wb_wbc(mapping->host, wbc);
wb_bandwidth_estimate_start(wb);
while (1) {
- if (mapping->a_ops->writepages) {
+ if (mapping->a_ops->writepages)
ret = mapping->a_ops->writepages(mapping, wbc);
- } else if (mapping->a_ops->writepage) {
- ret = writeback_use_writepage(mapping, wbc);
- } else {
+ else
/* deal with chardevs and other special files */
ret = 0;
- }
if (ret != -ENOMEM || wbc->sync_mode != WB_SYNC_ALL)
break;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5669baf2a6fe..47fa713ccb4d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -290,7 +290,8 @@ EXPORT_SYMBOL(nr_online_nodes);
#endif
static bool page_contains_unaccepted(struct page *page, unsigned int order);
-static bool cond_accept_memory(struct zone *zone, unsigned int order);
+static bool cond_accept_memory(struct zone *zone, unsigned int order,
+ int alloc_flags);
static bool __free_unaccepted(struct page *page);
int page_group_by_mobility_disabled __read_mostly;
@@ -1151,14 +1152,9 @@ static inline void pgalloc_tag_sub(struct page *page, unsigned int nr)
__pgalloc_tag_sub(page, nr);
}
-static inline void pgalloc_tag_sub_pages(struct page *page, unsigned int nr)
+/* When tag is not NULL, assuming mem_alloc_profiling_enabled */
+static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr)
{
- struct alloc_tag *tag;
-
- if (!mem_alloc_profiling_enabled())
- return;
-
- tag = __pgalloc_tag_get(page);
if (tag)
this_cpu_sub(tag->counters->bytes, PAGE_SIZE * nr);
}
@@ -1168,7 +1164,7 @@ static inline void pgalloc_tag_sub_pages(struct page *page, unsigned int nr)
static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
unsigned int nr) {}
static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) {}
-static inline void pgalloc_tag_sub_pages(struct page *page, unsigned int nr) {}
+static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) {}
#endif /* CONFIG_MEM_ALLOC_PROFILING */
@@ -3616,7 +3612,7 @@ retry:
}
}
- cond_accept_memory(zone, order);
+ cond_accept_memory(zone, order, alloc_flags);
/*
* Detect whether the number of free pages is below high
@@ -3643,7 +3639,7 @@ check_alloc_wmark:
gfp_mask)) {
int ret;
- if (cond_accept_memory(zone, order))
+ if (cond_accept_memory(zone, order, alloc_flags))
goto try_this_zone;
/*
@@ -3696,7 +3692,7 @@ try_this_zone:
return page;
} else {
- if (cond_accept_memory(zone, order))
+ if (cond_accept_memory(zone, order, alloc_flags))
goto try_this_zone;
/* Try again if zone has deferred pages */
@@ -4566,6 +4562,14 @@ restart:
}
retry:
+ /*
+ * Deal with possible cpuset update races or zonelist updates to avoid
+ * infinite retries.
+ */
+ if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
+ check_retry_zonelist(zonelist_iter_cookie))
+ goto restart;
+
/* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
if (alloc_flags & ALLOC_KSWAPD)
wake_all_kswapds(order, gfp_mask, ac);
@@ -4849,7 +4853,7 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
goto failed;
}
- cond_accept_memory(zone, 0);
+ cond_accept_memory(zone, 0, alloc_flags);
retry_this_zone:
mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK) + nr_pages;
if (zone_watermark_fast(zone, 0, mark,
@@ -4858,7 +4862,7 @@ retry_this_zone:
break;
}
- if (cond_accept_memory(zone, 0))
+ if (cond_accept_memory(zone, 0, alloc_flags))
goto retry_this_zone;
/* Try again if zone has deferred pages */
@@ -5065,11 +5069,13 @@ static void ___free_pages(struct page *page, unsigned int order,
{
/* get PageHead before we drop reference */
int head = PageHead(page);
+ /* get alloc tag in case the page is released by others */
+ struct alloc_tag *tag = pgalloc_tag_get(page);
if (put_page_testzero(page))
__free_frozen_pages(page, order, fpi_flags);
else if (!head) {
- pgalloc_tag_sub_pages(page, (1 << order) - 1);
+ pgalloc_tag_sub_pages(tag, (1 << order) - 1);
while (order-- > 0)
__free_frozen_pages(page + (1 << order), order,
fpi_flags);
@@ -7174,16 +7180,8 @@ bool has_managed_dma(void)
#ifdef CONFIG_UNACCEPTED_MEMORY
-/* Counts number of zones with unaccepted pages. */
-static DEFINE_STATIC_KEY_FALSE(zones_with_unaccepted_pages);
-
static bool lazy_accept = true;
-void unaccepted_cleanup_work(struct work_struct *work)
-{
- static_branch_dec(&zones_with_unaccepted_pages);
-}
-
static int __init accept_memory_parse(char *p)
{
if (!strcmp(p, "lazy")) {
@@ -7208,11 +7206,7 @@ static bool page_contains_unaccepted(struct page *page, unsigned int order)
static void __accept_page(struct zone *zone, unsigned long *flags,
struct page *page)
{
- bool last;
-
list_del(&page->lru);
- last = list_empty(&zone->unaccepted_pages);
-
account_freepages(zone, -MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE);
__mod_zone_page_state(zone, NR_UNACCEPTED, -MAX_ORDER_NR_PAGES);
__ClearPageUnaccepted(page);
@@ -7221,28 +7215,6 @@ static void __accept_page(struct zone *zone, unsigned long *flags,
accept_memory(page_to_phys(page), PAGE_SIZE << MAX_PAGE_ORDER);
__free_pages_ok(page, MAX_PAGE_ORDER, FPI_TO_TAIL);
-
- if (last) {
- /*
- * There are two corner cases:
- *
- * - If allocation occurs during the CPU bring up,
- * static_branch_dec() cannot be used directly as
- * it causes a deadlock on cpu_hotplug_lock.
- *
- * Instead, use schedule_work() to prevent deadlock.
- *
- * - If allocation occurs before workqueues are initialized,
- * static_branch_dec() should be called directly.
- *
- * Workqueues are initialized before CPU bring up, so this
- * will not conflict with the first scenario.
- */
- if (system_wq)
- schedule_work(&zone->unaccepted_cleanup);
- else
- unaccepted_cleanup_work(&zone->unaccepted_cleanup);
- }
}
void accept_page(struct page *page)
@@ -7279,20 +7251,17 @@ static bool try_to_accept_memory_one(struct zone *zone)
return true;
}
-static inline bool has_unaccepted_memory(void)
-{
- return static_branch_unlikely(&zones_with_unaccepted_pages);
-}
-
-static bool cond_accept_memory(struct zone *zone, unsigned int order)
+static bool cond_accept_memory(struct zone *zone, unsigned int order,
+ int alloc_flags)
{
long to_accept, wmark;
bool ret = false;
- if (!has_unaccepted_memory())
+ if (list_empty(&zone->unaccepted_pages))
return false;
- if (list_empty(&zone->unaccepted_pages))
+ /* Bailout, since try_to_accept_memory_one() needs to take a lock */
+ if (alloc_flags & ALLOC_TRYLOCK)
return false;
wmark = promo_wmark_pages(zone);
@@ -7325,22 +7294,17 @@ static bool __free_unaccepted(struct page *page)
{
struct zone *zone = page_zone(page);
unsigned long flags;
- bool first = false;
if (!lazy_accept)
return false;
spin_lock_irqsave(&zone->lock, flags);
- first = list_empty(&zone->unaccepted_pages);
list_add_tail(&page->lru, &zone->unaccepted_pages);
account_freepages(zone, MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE);
__mod_zone_page_state(zone, NR_UNACCEPTED, MAX_ORDER_NR_PAGES);
__SetPageUnaccepted(page);
spin_unlock_irqrestore(&zone->lock, flags);
- if (first)
- static_branch_inc(&zones_with_unaccepted_pages);
-
return true;
}
@@ -7351,7 +7315,8 @@ static bool page_contains_unaccepted(struct page *page, unsigned int order)
return false;
}
-static bool cond_accept_memory(struct zone *zone, unsigned int order)
+static bool cond_accept_memory(struct zone *zone, unsigned int order,
+ int alloc_flags)
{
return false;
}
@@ -7422,11 +7387,6 @@ struct page *try_alloc_pages_noprof(int nid, unsigned int order)
if (!pcp_allowed_order(order))
return NULL;
-#ifdef CONFIG_UNACCEPTED_MEMORY
- /* Bailout, since try_to_accept_memory_one() needs to take a lock */
- if (has_unaccepted_memory())
- return NULL;
-#endif
/* Bailout, since _deferred_grow_zone() needs to take a lock */
if (deferred_pages_enabled())
return NULL;
diff --git a/mm/page_io.c b/mm/page_io.c
index 4bce19df557b..f7716b6569fa 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -237,9 +237,8 @@ static void swap_zeromap_folio_clear(struct folio *folio)
* We may have stale swap cache pages in memory: notice
* them here and get rid of the unnecessary final write.
*/
-int swap_writepage(struct page *page, struct writeback_control *wbc)
+int swap_writeout(struct folio *folio, struct writeback_control *wbc)
{
- struct folio *folio = page_folio(page);
int ret;
if (folio_free_swap(folio)) {
diff --git a/mm/readahead.c b/mm/readahead.c
index 6a4e96b69702..20d36d6b055e 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -690,9 +690,15 @@ EXPORT_SYMBOL_GPL(page_cache_async_ra);
ssize_t ksys_readahead(int fd, loff_t offset, size_t count)
{
+ struct file *file;
+ const struct inode *inode;
+
CLASS(fd, f)(fd);
+ if (fd_empty(f))
+ return -EBADF;
- if (fd_empty(f) || !(fd_file(f)->f_mode & FMODE_READ))
+ file = fd_file(f);
+ if (!(file->f_mode & FMODE_READ))
return -EBADF;
/*
@@ -700,9 +706,15 @@ ssize_t ksys_readahead(int fd, loff_t offset, size_t count)
* that can execute readahead. If readahead is not possible
* on this file, then we must return -EINVAL.
*/
- if (!fd_file(f)->f_mapping || !fd_file(f)->f_mapping->a_ops ||
- (!S_ISREG(file_inode(fd_file(f))->i_mode) &&
- !S_ISBLK(file_inode(fd_file(f))->i_mode)))
+ if (!file->f_mapping)
+ return -EINVAL;
+ if (!file->f_mapping->a_ops)
+ return -EINVAL;
+
+ inode = file_inode(file);
+ if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode))
+ return -EINVAL;
+ if (IS_ANON_FILE(inode))
return -EINVAL;
return vfs_fadvise(fd_file(f), offset, count, POSIX_FADV_WILLNEED);
diff --git a/mm/shmem.c b/mm/shmem.c
index 99327c30507c..858cee02ca49 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -98,7 +98,7 @@ static struct vfsmount *shm_mnt __ro_after_init;
#define SHORT_SYMLINK_LEN 128
/*
- * shmem_fallocate communicates with shmem_fault or shmem_writepage via
+ * shmem_fallocate communicates with shmem_fault or shmem_writeout via
* inode->i_private (with i_rwsem making sure that it has only one user at
* a time): we would prefer not to enlarge the shmem inode just for that.
*/
@@ -107,7 +107,7 @@ struct shmem_falloc {
pgoff_t start; /* start of range currently being fallocated */
pgoff_t next; /* the next page offset to be fallocated */
pgoff_t nr_falloced; /* how many new pages have been fallocated */
- pgoff_t nr_unswapped; /* how often writepage refused to swap out */
+ pgoff_t nr_unswapped; /* how often writeout refused to swap out */
};
struct shmem_options {
@@ -446,7 +446,7 @@ static void shmem_recalc_inode(struct inode *inode, long alloced, long swapped)
/*
* Special case: whereas normally shmem_recalc_inode() is called
* after i_mapping->nrpages has already been adjusted (up or down),
- * shmem_writepage() has to raise swapped before nrpages is lowered -
+ * shmem_writeout() has to raise swapped before nrpages is lowered -
* to stop a racing shmem_recalc_inode() from thinking that a page has
* been freed. Compensate here, to avoid the need for a followup call.
*/
@@ -1536,12 +1536,15 @@ int shmem_unuse(unsigned int type)
return error;
}
-/*
- * Move the page from the page cache to the swap cache.
+/**
+ * shmem_writeout - Write the folio to swap
+ * @folio: The folio to write
+ * @wbc: How writeback is to be done
+ *
+ * Move the folio from the page cache to the swap cache.
*/
-static int shmem_writepage(struct page *page, struct writeback_control *wbc)
+int shmem_writeout(struct folio *folio, struct writeback_control *wbc)
{
- struct folio *folio = page_folio(page);
struct address_space *mapping = folio->mapping;
struct inode *inode = mapping->host;
struct shmem_inode_info *info = SHMEM_I(inode);
@@ -1550,13 +1553,6 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
int nr_pages;
bool split = false;
- /*
- * Our capabilities prevent regular writeback or sync from ever calling
- * shmem_writepage; but a stacking filesystem might use ->writepage of
- * its underlying filesystem, in which case tmpfs should write out to
- * swap only in response to memory pressure, and not for the writeback
- * threads or sync.
- */
if (WARN_ON_ONCE(!wbc->for_reclaim))
goto redirty;
@@ -1586,9 +1582,8 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
try_split:
/* Ensure the subpages are still dirty */
folio_test_set_dirty(folio);
- if (split_huge_page_to_list_to_order(page, wbc->list, 0))
+ if (split_folio_to_list(folio, wbc->list))
goto redirty;
- folio = page_folio(page);
folio_clear_dirty(folio);
}
@@ -1646,7 +1641,7 @@ try_split:
mutex_unlock(&shmem_swaplist_mutex);
BUG_ON(folio_mapped(folio));
- return swap_writepage(&folio->page, wbc);
+ return swap_writeout(folio, wbc);
}
list_del_init(&info->swaplist);
@@ -1660,6 +1655,7 @@ redirty:
folio_unlock(folio);
return 0;
}
+EXPORT_SYMBOL_GPL(shmem_writeout);
#if defined(CONFIG_NUMA) && defined(CONFIG_TMPFS)
static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
@@ -3768,7 +3764,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
index--;
/*
- * Inform shmem_writepage() how far we have reached.
+ * Inform shmem_writeout() how far we have reached.
* No need for lock or barrier: we have the page lock.
*/
if (!folio_test_uptodate(folio))
@@ -5191,7 +5187,6 @@ static int shmem_error_remove_folio(struct address_space *mapping,
}
static const struct address_space_operations shmem_aops = {
- .writepage = shmem_writepage,
.dirty_folio = noop_dirty_folio,
#ifdef CONFIG_TMPFS
.write_begin = shmem_write_begin,
diff --git a/mm/show_mem.c b/mm/show_mem.c
index 6af13bcd2ab3..5acb51a9fc49 100644
--- a/mm/show_mem.c
+++ b/mm/show_mem.c
@@ -223,7 +223,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z
global_node_page_state(NR_SHMEM),
global_node_page_state(NR_PAGETABLE),
global_node_page_state(NR_SECONDARY_PAGETABLE),
- global_zone_page_state(NR_BOUNCE),
+ 0UL,
global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE),
global_zone_page_state(NR_FREE_PAGES),
free_pcp,
@@ -341,7 +341,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z
K(zone->present_pages),
K(zone_managed_pages(zone)),
K(zone_page_state(zone, NR_MLOCK)),
- K(zone_page_state(zone, NR_BOUNCE)),
+ 0UL,
K(free_pcp),
K(this_cpu_read(zone->per_cpu_pageset->count)),
K(zone_page_state(zone, NR_FREE_CMA_PAGES)));
diff --git a/mm/swap.h b/mm/swap.h
index 6f4a3f927edb..aa62463976d5 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -20,7 +20,7 @@ static inline void swap_read_unplug(struct swap_iocb *plug)
__swap_read_unplug(plug);
}
void swap_write_unplug(struct swap_iocb *sio);
-int swap_writepage(struct page *page, struct writeback_control *wbc);
+int swap_writeout(struct folio *folio, struct writeback_control *wbc);
void __swap_writepage(struct folio *folio, struct writeback_control *wbc);
/* linux/mm/swap_state.c */
@@ -141,7 +141,7 @@ static inline struct folio *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask,
return NULL;
}
-static inline int swap_writepage(struct page *p, struct writeback_control *wbc)
+static inline int swap_writeout(struct folio *f, struct writeback_control *wbc)
{
return 0;
}
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 68fd981b514f..ec2b1c9c9926 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -30,7 +30,6 @@
* vmscan's shrink_folio_list.
*/
static const struct address_space_operations swap_aops = {
- .writepage = swap_writepage,
.dirty_folio = noop_dirty_folio,
#ifdef CONFIG_MIGRATION
.migrate_folio = migrate_folio,
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 412ccd6543b3..86643b181098 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2368,7 +2368,7 @@ retry:
* Limit the number of retries? No: when mmget_not_zero()
* above fails, that mm is likely to be freeing swap from
* exit_mmap(), which proceeds at its own independent pace;
- * and even shmem_writepage() could have been preempted after
+ * and even shmem_writeout() could have been preempted after
* folio_alloc_swap(), temporarily hiding that swap. It's easy
* and robust (though cpu-intensive) just to keep retrying.
*/
diff --git a/mm/truncate.c b/mm/truncate.c
index 5d98054094d1..f2aaf99f2990 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -191,6 +191,7 @@ int truncate_inode_folio(struct address_space *mapping, struct folio *folio)
bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end)
{
loff_t pos = folio_pos(folio);
+ size_t size = folio_size(folio);
unsigned int offset, length;
struct page *split_at, *split_at2;
@@ -198,14 +199,13 @@ bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end)
offset = start - pos;
else
offset = 0;
- length = folio_size(folio);
- if (pos + length <= (u64)end)
- length = length - offset;
+ if (pos + size <= (u64)end)
+ length = size - offset;
else
length = end + 1 - pos - offset;
folio_wait_writeback(folio);
- if (length == folio_size(folio)) {
+ if (length == size) {
truncate_inode_folio(folio->mapping, folio);
return true;
}
@@ -224,16 +224,20 @@ bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end)
return true;
split_at = folio_page(folio, PAGE_ALIGN_DOWN(offset) / PAGE_SIZE);
- split_at2 = folio_page(folio,
- PAGE_ALIGN_DOWN(offset + length) / PAGE_SIZE);
-
if (!try_folio_split(folio, split_at, NULL)) {
/*
* try to split at offset + length to make sure folios within
* the range can be dropped, especially to avoid memory waste
* for shmem truncate
*/
- struct folio *folio2 = page_folio(split_at2);
+ struct folio *folio2;
+
+ if (offset + length == size)
+ goto no_split;
+
+ split_at2 = folio_page(folio,
+ PAGE_ALIGN_DOWN(offset + length) / PAGE_SIZE);
+ folio2 = page_folio(split_at2);
if (!folio_try_get(folio2))
goto no_split;
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 7d5d709cc838..e0db855c89b4 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -1064,8 +1064,13 @@ static int move_present_pte(struct mm_struct *mm,
src_folio->index = linear_page_index(dst_vma, dst_addr);
orig_dst_pte = mk_pte(&src_folio->page, dst_vma->vm_page_prot);
- /* Follow mremap() behavior and treat the entry dirty after the move */
- orig_dst_pte = pte_mkwrite(pte_mkdirty(orig_dst_pte), dst_vma);
+ /* Set soft dirty bit so userspace can notice the pte was moved */
+#ifdef CONFIG_MEM_SOFT_DIRTY
+ orig_dst_pte = pte_mksoft_dirty(orig_dst_pte);
+#endif
+ if (pte_dirty(orig_src_pte))
+ orig_dst_pte = pte_mkdirty(orig_dst_pte);
+ orig_dst_pte = pte_mkwrite(orig_dst_pte, dst_vma);
set_pte_at(mm, dst_addr, dst_pte, orig_dst_pte);
out:
@@ -1100,6 +1105,9 @@ static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma,
}
orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte);
+#ifdef CONFIG_MEM_SOFT_DIRTY
+ orig_src_pte = pte_swp_mksoft_dirty(orig_src_pte);
+#endif
set_pte_at(mm, dst_addr, dst_pte, orig_src_pte);
double_pt_unlock(dst_ptl, src_ptl);
diff --git a/mm/vma.c b/mm/vma.c
index 839d12f02c88..a468d4c29c0c 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -1834,6 +1834,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
return new_vma;
out_vma_link:
+ fixup_hugetlb_reservations(new_vma);
vma_close(new_vma);
if (new_vma->vm_file)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 2d7511654831..00cf1b575c89 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -4093,8 +4093,8 @@ void *vrealloc_noprof(const void *p, size_t size, gfp_t flags)
* would be a good heuristic for when to shrink the vm_area?
*/
if (size <= old_size) {
- /* Zero out "freed" memory. */
- if (want_init_on_free())
+ /* Zero out "freed" memory, potentially for future realloc. */
+ if (want_init_on_free() || want_init_on_alloc(flags))
memset((void *)p + size, 0, old_size - size);
vm->requested_size = size;
kasan_poison_vmalloc(p + size, old_size - size);
@@ -4107,10 +4107,13 @@ void *vrealloc_noprof(const void *p, size_t size, gfp_t flags)
if (size <= alloced_size) {
kasan_unpoison_vmalloc(p + old_size, size - old_size,
KASAN_VMALLOC_PROT_NORMAL);
- /* Zero out "alloced" memory. */
- if (want_init_on_alloc(flags))
- memset((void *)p + old_size, 0, size - old_size);
+ /*
+ * No need to zero memory here, as unused memory will have
+ * already been zeroed at initial allocation time or during
+ * realloc shrink time.
+ */
vm->requested_size = size;
+ return (void *)p;
}
/* TODO: Grow the vm_area, i.e. allocate and map additional pages. */
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 3783e45bfc92..b6f4db6c240f 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -648,21 +648,20 @@ typedef enum {
/*
* pageout is called by shrink_folio_list() for each dirty folio.
- * Calls ->writepage().
*/
static pageout_t pageout(struct folio *folio, struct address_space *mapping,
struct swap_iocb **plug, struct list_head *folio_list)
{
+ int (*writeout)(struct folio *, struct writeback_control *);
+
/*
- * If the folio is dirty, only perform writeback if that write
- * will be non-blocking. To prevent this allocation from being
- * stalled by pagecache activity. But note that there may be
- * stalls if we need to run get_block(). We could test
- * PagePrivate for that.
- *
- * If this process is currently in __generic_file_write_iter() against
- * this folio's queue, we can perform writeback even if that
- * will block.
+ * We no longer attempt to writeback filesystem folios here, other
+ * than tmpfs/shmem. That's taken care of in page-writeback.
+ * If we find a dirty filesystem folio at the end of the LRU list,
+ * typically that means the filesystem is saturating the storage
+ * with contiguous writes and telling it to write a folio here
+ * would only make the situation worse by injecting an element
+ * of random access.
*
* If the folio is swapcache, write it back even if that would
* block, for some throttling. This happens by accident, because
@@ -685,7 +684,11 @@ static pageout_t pageout(struct folio *folio, struct address_space *mapping,
}
return PAGE_KEEP;
}
- if (mapping->a_ops->writepage == NULL)
+ if (shmem_mapping(mapping))
+ writeout = shmem_writeout;
+ else if (folio_test_anon(folio))
+ writeout = swap_writeout;
+ else
return PAGE_ACTIVATE;
if (folio_clear_dirty_for_io(folio)) {
@@ -708,7 +711,7 @@ static pageout_t pageout(struct folio *folio, struct address_space *mapping,
wbc.list = folio_list;
folio_set_reclaim(folio);
- res = mapping->a_ops->writepage(&folio->page, &wbc);
+ res = writeout(folio, &wbc);
if (res < 0)
handle_write_error(mapping, folio, res);
if (res == AOP_WRITEPAGE_ACTIVATE) {
@@ -717,7 +720,7 @@ static pageout_t pageout(struct folio *folio, struct address_space *mapping,
}
if (!folio_test_writeback(folio)) {
- /* synchronous write or broken a_ops? */
+ /* synchronous write? */
folio_clear_reclaim(folio);
}
trace_mm_vmscan_write_folio(folio);
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 961b270f023c..d14a7e317ac8 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -1243,19 +1243,19 @@ void zs_obj_write(struct zs_pool *pool, unsigned long handle,
class = zspage_class(pool, zspage);
off = offset_in_page(class->size * obj_idx);
- if (off + class->size <= PAGE_SIZE) {
+ if (!ZsHugePage(zspage))
+ off += ZS_HANDLE_SIZE;
+
+ if (off + mem_len <= PAGE_SIZE) {
/* this object is contained entirely within a page */
void *dst = kmap_local_zpdesc(zpdesc);
- if (!ZsHugePage(zspage))
- off += ZS_HANDLE_SIZE;
memcpy(dst + off, handle_mem, mem_len);
kunmap_local(dst);
} else {
/* this object spans two pages */
size_t sizes[2];
- off += ZS_HANDLE_SIZE;
sizes[0] = PAGE_SIZE - off;
sizes[1] = mem_len - sizes[0];
diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c
index 7cd4bdcee439..558d39dffc23 100644
--- a/net/batman-adv/hard-interface.c
+++ b/net/batman-adv/hard-interface.c
@@ -506,28 +506,32 @@ batadv_hardif_is_iface_up(const struct batadv_hard_iface *hard_iface)
return false;
}
-static void batadv_check_known_mac_addr(const struct net_device *net_dev)
+static void batadv_check_known_mac_addr(const struct batadv_hard_iface *hard_iface)
{
- const struct batadv_hard_iface *hard_iface;
+ const struct net_device *mesh_iface = hard_iface->mesh_iface;
+ const struct batadv_hard_iface *tmp_hard_iface;
- rcu_read_lock();
- list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) {
- if (hard_iface->if_status != BATADV_IF_ACTIVE &&
- hard_iface->if_status != BATADV_IF_TO_BE_ACTIVATED)
+ if (!mesh_iface)
+ return;
+
+ list_for_each_entry(tmp_hard_iface, &batadv_hardif_list, list) {
+ if (tmp_hard_iface == hard_iface)
+ continue;
+
+ if (tmp_hard_iface->mesh_iface != mesh_iface)
continue;
- if (hard_iface->net_dev == net_dev)
+ if (tmp_hard_iface->if_status == BATADV_IF_NOT_IN_USE)
continue;
- if (!batadv_compare_eth(hard_iface->net_dev->dev_addr,
- net_dev->dev_addr))
+ if (!batadv_compare_eth(tmp_hard_iface->net_dev->dev_addr,
+ hard_iface->net_dev->dev_addr))
continue;
pr_warn("The newly added mac address (%pM) already exists on: %s\n",
- net_dev->dev_addr, hard_iface->net_dev->name);
+ hard_iface->net_dev->dev_addr, tmp_hard_iface->net_dev->name);
pr_warn("It is strongly recommended to keep mac addresses unique to avoid problems!\n");
}
- rcu_read_unlock();
}
/**
@@ -763,6 +767,8 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface,
hard_iface->net_dev->name, hardif_mtu,
required_mtu);
+ batadv_check_known_mac_addr(hard_iface);
+
if (batadv_hardif_is_iface_up(hard_iface))
batadv_hardif_activate_interface(hard_iface);
else
@@ -901,7 +907,6 @@ batadv_hardif_add_interface(struct net_device *net_dev)
batadv_v_hardif_init(hard_iface);
- batadv_check_known_mac_addr(hard_iface->net_dev);
kref_get(&hard_iface->refcount);
list_add_tail_rcu(&hard_iface->list, &batadv_hardif_list);
batadv_hardif_generation++;
@@ -988,7 +993,7 @@ static int batadv_hard_if_event(struct notifier_block *this,
if (hard_iface->if_status == BATADV_IF_NOT_IN_USE)
goto hardif_put;
- batadv_check_known_mac_addr(hard_iface->net_dev);
+ batadv_check_known_mac_addr(hard_iface);
bat_priv = netdev_priv(hard_iface->mesh_iface);
bat_priv->algo_ops->iface.update_mac(hard_iface);
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index 6533e281ada3..946d2ae551f8 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -3023,3 +3023,27 @@ void hci_conn_tx_dequeue(struct hci_conn *conn)
kfree_skb(skb);
}
+
+u8 *hci_conn_key_enc_size(struct hci_conn *conn)
+{
+ if (conn->type == ACL_LINK) {
+ struct link_key *key;
+
+ key = hci_find_link_key(conn->hdev, &conn->dst);
+ if (!key)
+ return NULL;
+
+ return &key->pin_len;
+ } else if (conn->type == LE_LINK) {
+ struct smp_ltk *ltk;
+
+ ltk = hci_find_ltk(conn->hdev, &conn->dst, conn->dst_type,
+ conn->role);
+ if (!ltk)
+ return NULL;
+
+ return &ltk->enc_size;
+ }
+
+ return NULL;
+}
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index 6d6061111ac5..c38ada69c3d7 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -739,10 +739,17 @@ static u8 hci_cc_read_enc_key_size(struct hci_dev *hdev, void *data,
handle);
conn->enc_key_size = 0;
} else {
+ u8 *key_enc_size = hci_conn_key_enc_size(conn);
+
conn->enc_key_size = rp->key_size;
status = 0;
- if (conn->enc_key_size < hdev->min_enc_key_size) {
+ /* Attempt to check if the key size is too small or if it has
+ * been downgraded from the last time it was stored as part of
+ * the link_key.
+ */
+ if (conn->enc_key_size < hdev->min_enc_key_size ||
+ (key_enc_size && conn->enc_key_size < *key_enc_size)) {
/* As slave role, the conn->state has been set to
* BT_CONNECTED and l2cap conn req might not be received
* yet, at this moment the l2cap layer almost does
@@ -755,6 +762,10 @@ static u8 hci_cc_read_enc_key_size(struct hci_dev *hdev, void *data,
clear_bit(HCI_CONN_ENCRYPT, &conn->flags);
clear_bit(HCI_CONN_AES_CCM, &conn->flags);
}
+
+ /* Update the key encryption size with the connection one */
+ if (key_enc_size && *key_enc_size != conn->enc_key_size)
+ *key_enc_size = conn->enc_key_size;
}
hci_encrypt_cfm(conn, status);
@@ -3065,6 +3076,34 @@ static void hci_inquiry_result_evt(struct hci_dev *hdev, void *edata,
hci_dev_unlock(hdev);
}
+static int hci_read_enc_key_size(struct hci_dev *hdev, struct hci_conn *conn)
+{
+ struct hci_cp_read_enc_key_size cp;
+ u8 *key_enc_size = hci_conn_key_enc_size(conn);
+
+ if (!read_key_size_capable(hdev)) {
+ conn->enc_key_size = HCI_LINK_KEY_SIZE;
+ return -EOPNOTSUPP;
+ }
+
+ bt_dev_dbg(hdev, "hcon %p", conn);
+
+ memset(&cp, 0, sizeof(cp));
+ cp.handle = cpu_to_le16(conn->handle);
+
+ /* If the key enc_size is already known, use it as conn->enc_key_size,
+ * otherwise use hdev->min_enc_key_size so the likes of
+ * l2cap_check_enc_key_size don't fail while waiting for
+ * HCI_OP_READ_ENC_KEY_SIZE response.
+ */
+ if (key_enc_size && *key_enc_size)
+ conn->enc_key_size = *key_enc_size;
+ else
+ conn->enc_key_size = hdev->min_enc_key_size;
+
+ return hci_send_cmd(hdev, HCI_OP_READ_ENC_KEY_SIZE, sizeof(cp), &cp);
+}
+
static void hci_conn_complete_evt(struct hci_dev *hdev, void *data,
struct sk_buff *skb)
{
@@ -3157,23 +3196,11 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, void *data,
if (ev->encr_mode == 1 && !test_bit(HCI_CONN_ENCRYPT, &conn->flags) &&
ev->link_type == ACL_LINK) {
struct link_key *key;
- struct hci_cp_read_enc_key_size cp;
key = hci_find_link_key(hdev, &ev->bdaddr);
if (key) {
set_bit(HCI_CONN_ENCRYPT, &conn->flags);
-
- if (!read_key_size_capable(hdev)) {
- conn->enc_key_size = HCI_LINK_KEY_SIZE;
- } else {
- cp.handle = cpu_to_le16(conn->handle);
- if (hci_send_cmd(hdev, HCI_OP_READ_ENC_KEY_SIZE,
- sizeof(cp), &cp)) {
- bt_dev_err(hdev, "sending read key size failed");
- conn->enc_key_size = HCI_LINK_KEY_SIZE;
- }
- }
-
+ hci_read_enc_key_size(hdev, conn);
hci_encrypt_cfm(conn, ev->status);
}
}
@@ -3612,24 +3639,8 @@ static void hci_encrypt_change_evt(struct hci_dev *hdev, void *data,
/* Try reading the encryption key size for encrypted ACL links */
if (!ev->status && ev->encrypt && conn->type == ACL_LINK) {
- struct hci_cp_read_enc_key_size cp;
-
- /* Only send HCI_Read_Encryption_Key_Size if the
- * controller really supports it. If it doesn't, assume
- * the default size (16).
- */
- if (!read_key_size_capable(hdev)) {
- conn->enc_key_size = HCI_LINK_KEY_SIZE;
- goto notify;
- }
-
- cp.handle = cpu_to_le16(conn->handle);
- if (hci_send_cmd(hdev, HCI_OP_READ_ENC_KEY_SIZE,
- sizeof(cp), &cp)) {
- bt_dev_err(hdev, "sending read key size failed");
- conn->enc_key_size = HCI_LINK_KEY_SIZE;
+ if (hci_read_enc_key_size(hdev, conn))
goto notify;
- }
goto unlock;
}
diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c
index 73472756618a..042d3ac3b4a3 100644
--- a/net/bluetooth/l2cap_core.c
+++ b/net/bluetooth/l2cap_core.c
@@ -1411,7 +1411,8 @@ static void l2cap_request_info(struct l2cap_conn *conn)
sizeof(req), &req);
}
-static bool l2cap_check_enc_key_size(struct hci_conn *hcon)
+static bool l2cap_check_enc_key_size(struct hci_conn *hcon,
+ struct l2cap_chan *chan)
{
/* The minimum encryption key size needs to be enforced by the
* host stack before establishing any L2CAP connections. The
@@ -1425,7 +1426,7 @@ static bool l2cap_check_enc_key_size(struct hci_conn *hcon)
int min_key_size = hcon->hdev->min_enc_key_size;
/* On FIPS security level, key size must be 16 bytes */
- if (hcon->sec_level == BT_SECURITY_FIPS)
+ if (chan->sec_level == BT_SECURITY_FIPS)
min_key_size = 16;
return (!test_bit(HCI_CONN_ENCRYPT, &hcon->flags) ||
@@ -1453,7 +1454,7 @@ static void l2cap_do_start(struct l2cap_chan *chan)
!__l2cap_no_conn_pending(chan))
return;
- if (l2cap_check_enc_key_size(conn->hcon))
+ if (l2cap_check_enc_key_size(conn->hcon, chan))
l2cap_start_connection(chan);
else
__set_chan_timer(chan, L2CAP_DISC_TIMEOUT);
@@ -1528,7 +1529,7 @@ static void l2cap_conn_start(struct l2cap_conn *conn)
continue;
}
- if (l2cap_check_enc_key_size(conn->hcon))
+ if (l2cap_check_enc_key_size(conn->hcon, chan))
l2cap_start_connection(chan);
else
l2cap_chan_close(chan, ECONNREFUSED);
@@ -3992,7 +3993,7 @@ static void l2cap_connect(struct l2cap_conn *conn, struct l2cap_cmd_hdr *cmd,
/* Check if the ACL is secure enough (if not SDP) */
if (psm != cpu_to_le16(L2CAP_PSM_SDP) &&
(!hci_conn_check_link_mode(conn->hcon) ||
- !l2cap_check_enc_key_size(conn->hcon))) {
+ !l2cap_check_enc_key_size(conn->hcon, pchan))) {
conn->disc_reason = HCI_ERROR_AUTH_FAILURE;
result = L2CAP_CR_SEC_BLOCK;
goto response;
@@ -7352,7 +7353,7 @@ static void l2cap_security_cfm(struct hci_conn *hcon, u8 status, u8 encrypt)
}
if (chan->state == BT_CONNECT) {
- if (!status && l2cap_check_enc_key_size(hcon))
+ if (!status && l2cap_check_enc_key_size(hcon, chan))
l2cap_start_connection(chan);
else
__set_chan_timer(chan, L2CAP_DISC_TIMEOUT);
@@ -7362,7 +7363,7 @@ static void l2cap_security_cfm(struct hci_conn *hcon, u8 status, u8 encrypt)
struct l2cap_conn_rsp rsp;
__u16 res, stat;
- if (!status && l2cap_check_enc_key_size(hcon)) {
+ if (!status && l2cap_check_enc_key_size(hcon, chan)) {
if (test_bit(FLAG_DEFER_SETUP, &chan->flags)) {
res = L2CAP_CR_PEND;
stat = L2CAP_CS_AUTHOR_PEND;
diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
index c1e1e529e26c..46b22708dfbd 100644
--- a/net/bluetooth/mgmt.c
+++ b/net/bluetooth/mgmt.c
@@ -7506,11 +7506,16 @@ static void add_device_complete(struct hci_dev *hdev, void *data, int err)
struct mgmt_cp_add_device *cp = cmd->param;
if (!err) {
+ struct hci_conn_params *params;
+
+ params = hci_conn_params_lookup(hdev, &cp->addr.bdaddr,
+ le_addr_type(cp->addr.type));
+
device_added(cmd->sk, hdev, &cp->addr.bdaddr, cp->addr.type,
cp->action);
device_flags_changed(NULL, hdev, &cp->addr.bdaddr,
cp->addr.type, hdev->conn_flags,
- PTR_UINT(cmd->user_data));
+ params ? params->flags : 0);
}
mgmt_cmd_complete(cmd->sk, hdev->id, MGMT_OP_ADD_DEVICE,
@@ -7613,8 +7618,6 @@ static int add_device(struct sock *sk, struct hci_dev *hdev,
goto unlock;
}
- cmd->user_data = UINT_PTR(current_flags);
-
err = hci_cmd_sync_queue(hdev, add_device_sync, cmd,
add_device_complete);
if (err < 0) {
diff --git a/net/bridge/br_nf_core.c b/net/bridge/br_nf_core.c
index 98aea5485aae..a8c67035e23c 100644
--- a/net/bridge/br_nf_core.c
+++ b/net/bridge/br_nf_core.c
@@ -65,17 +65,14 @@ static struct dst_ops fake_dst_ops = {
* ipt_REJECT needs it. Future netfilter modules might
* require us to fill additional fields.
*/
-static const u32 br_dst_default_metrics[RTAX_MAX] = {
- [RTAX_MTU - 1] = 1500,
-};
-
void br_netfilter_rtable_init(struct net_bridge *br)
{
struct rtable *rt = &br->fake_rtable;
rcuref_init(&rt->dst.__rcuref, 1);
rt->dst.dev = br->dev;
- dst_init_metrics(&rt->dst, br_dst_default_metrics, true);
+ dst_init_metrics(&rt->dst, br->metrics, false);
+ dst_metric_set(&rt->dst, RTAX_MTU, br->dev->mtu);
rt->dst.flags = DST_NOXFRM | DST_FAKE_RTABLE;
rt->dst.ops = &fake_dst_ops;
}
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index d5b3c5936a79..4715a8d6dc32 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -505,6 +505,7 @@ struct net_bridge {
struct rtable fake_rtable;
struct rt6_info fake_rt6_info;
};
+ u32 metrics[RTAX_MAX];
#endif
u16 group_fwd_mask;
u16 group_fwd_mask_required;
diff --git a/net/can/bcm.c b/net/can/bcm.c
index 0bca1b9b3f70..6bc1cc4c94c5 100644
--- a/net/can/bcm.c
+++ b/net/can/bcm.c
@@ -58,6 +58,7 @@
#include <linux/can/skb.h>
#include <linux/can/bcm.h>
#include <linux/slab.h>
+#include <linux/spinlock.h>
#include <net/sock.h>
#include <net/net_namespace.h>
@@ -122,6 +123,7 @@ struct bcm_op {
struct canfd_frame last_sframe;
struct sock *sk;
struct net_device *rx_reg_dev;
+ spinlock_t bcm_tx_lock; /* protect currframe/count in runtime updates */
};
struct bcm_sock {
@@ -217,7 +219,9 @@ static int bcm_proc_show(struct seq_file *m, void *v)
seq_printf(m, " / bound %s", bcm_proc_getifname(net, ifname, bo->ifindex));
seq_printf(m, " <<<\n");
- list_for_each_entry(op, &bo->rx_ops, list) {
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(op, &bo->rx_ops, list) {
unsigned long reduction;
@@ -273,6 +277,9 @@ static int bcm_proc_show(struct seq_file *m, void *v)
seq_printf(m, "# sent %ld\n", op->frames_abs);
}
seq_putc(m, '\n');
+
+ rcu_read_unlock();
+
return 0;
}
#endif /* CONFIG_PROC_FS */
@@ -285,13 +292,18 @@ static void bcm_can_tx(struct bcm_op *op)
{
struct sk_buff *skb;
struct net_device *dev;
- struct canfd_frame *cf = op->frames + op->cfsiz * op->currframe;
+ struct canfd_frame *cf;
int err;
/* no target device? => exit */
if (!op->ifindex)
return;
+ /* read currframe under lock protection */
+ spin_lock_bh(&op->bcm_tx_lock);
+ cf = op->frames + op->cfsiz * op->currframe;
+ spin_unlock_bh(&op->bcm_tx_lock);
+
dev = dev_get_by_index(sock_net(op->sk), op->ifindex);
if (!dev) {
/* RFC: should this bcm_op remove itself here? */
@@ -312,6 +324,10 @@ static void bcm_can_tx(struct bcm_op *op)
skb->dev = dev;
can_skb_set_owner(skb, op->sk);
err = can_send(skb, 1);
+
+ /* update currframe and count under lock protection */
+ spin_lock_bh(&op->bcm_tx_lock);
+
if (!err)
op->frames_abs++;
@@ -320,6 +336,11 @@ static void bcm_can_tx(struct bcm_op *op)
/* reached last frame? */
if (op->currframe >= op->nframes)
op->currframe = 0;
+
+ if (op->count > 0)
+ op->count--;
+
+ spin_unlock_bh(&op->bcm_tx_lock);
out:
dev_put(dev);
}
@@ -430,7 +451,7 @@ static enum hrtimer_restart bcm_tx_timeout_handler(struct hrtimer *hrtimer)
struct bcm_msg_head msg_head;
if (op->kt_ival1 && (op->count > 0)) {
- op->count--;
+ bcm_can_tx(op);
if (!op->count && (op->flags & TX_COUNTEVT)) {
/* create notification to user */
@@ -445,7 +466,6 @@ static enum hrtimer_restart bcm_tx_timeout_handler(struct hrtimer *hrtimer)
bcm_send_to_user(op, &msg_head, NULL, 0);
}
- bcm_can_tx(op);
} else if (op->kt_ival2) {
bcm_can_tx(op);
@@ -843,7 +863,7 @@ static int bcm_delete_rx_op(struct list_head *ops, struct bcm_msg_head *mh,
REGMASK(op->can_id),
bcm_rx_handler, op);
- list_del(&op->list);
+ list_del_rcu(&op->list);
bcm_remove_op(op);
return 1; /* done */
}
@@ -863,7 +883,7 @@ static int bcm_delete_tx_op(struct list_head *ops, struct bcm_msg_head *mh,
list_for_each_entry_safe(op, n, ops, list) {
if ((op->can_id == mh->can_id) && (op->ifindex == ifindex) &&
(op->flags & CAN_FD_FRAME) == (mh->flags & CAN_FD_FRAME)) {
- list_del(&op->list);
+ list_del_rcu(&op->list);
bcm_remove_op(op);
return 1; /* done */
}
@@ -956,6 +976,27 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
}
op->flags = msg_head->flags;
+ /* only lock for unlikely count/nframes/currframe changes */
+ if (op->nframes != msg_head->nframes ||
+ op->flags & TX_RESET_MULTI_IDX ||
+ op->flags & SETTIMER) {
+
+ spin_lock_bh(&op->bcm_tx_lock);
+
+ if (op->nframes != msg_head->nframes ||
+ op->flags & TX_RESET_MULTI_IDX) {
+ /* potentially update changed nframes */
+ op->nframes = msg_head->nframes;
+ /* restart multiple frame transmission */
+ op->currframe = 0;
+ }
+
+ if (op->flags & SETTIMER)
+ op->count = msg_head->count;
+
+ spin_unlock_bh(&op->bcm_tx_lock);
+ }
+
} else {
/* insert new BCM operation for the given can_id */
@@ -963,9 +1004,14 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
if (!op)
return -ENOMEM;
+ spin_lock_init(&op->bcm_tx_lock);
op->can_id = msg_head->can_id;
op->cfsiz = CFSIZ(msg_head->flags);
op->flags = msg_head->flags;
+ op->nframes = msg_head->nframes;
+
+ if (op->flags & SETTIMER)
+ op->count = msg_head->count;
/* create array for CAN frames and copy the data */
if (msg_head->nframes > 1) {
@@ -1023,22 +1069,8 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
} /* if ((op = bcm_find_op(&bo->tx_ops, msg_head->can_id, ifindex))) */
- if (op->nframes != msg_head->nframes) {
- op->nframes = msg_head->nframes;
- /* start multiple frame transmission with index 0 */
- op->currframe = 0;
- }
-
- /* check flags */
-
- if (op->flags & TX_RESET_MULTI_IDX) {
- /* start multiple frame transmission with index 0 */
- op->currframe = 0;
- }
-
if (op->flags & SETTIMER) {
/* set timer values */
- op->count = msg_head->count;
op->ival1 = msg_head->ival1;
op->ival2 = msg_head->ival2;
op->kt_ival1 = bcm_timeval_to_ktime(msg_head->ival1);
@@ -1055,11 +1087,8 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
op->flags |= TX_ANNOUNCE;
}
- if (op->flags & TX_ANNOUNCE) {
+ if (op->flags & TX_ANNOUNCE)
bcm_can_tx(op);
- if (op->count)
- op->count--;
- }
if (op->flags & STARTTIMER)
bcm_tx_start_timer(op);
@@ -1272,7 +1301,7 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
bcm_rx_handler, op, "bcm", sk);
if (err) {
/* this bcm rx op is broken -> remove it */
- list_del(&op->list);
+ list_del_rcu(&op->list);
bcm_remove_op(op);
return err;
}
diff --git a/net/core/dev.c b/net/core/dev.c
index 11da1e272ec2..0d891634c692 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -10441,6 +10441,7 @@ static void netdev_sync_lower_features(struct net_device *upper,
if (!(features & feature) && (lower->features & feature)) {
netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
&feature, lower->name);
+ netdev_lock_ops(lower);
lower->wanted_features &= ~feature;
__netdev_update_features(lower);
@@ -10449,6 +10450,7 @@ static void netdev_sync_lower_features(struct net_device *upper,
&feature, lower->name);
else
netdev_features_change(lower);
+ netdev_unlock_ops(lower);
}
}
}
diff --git a/net/core/devmem.c b/net/core/devmem.c
index 6e27a47d0493..2db428ab6b8b 100644
--- a/net/core/devmem.c
+++ b/net/core/devmem.c
@@ -200,6 +200,8 @@ net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd,
refcount_set(&binding->ref, 1);
+ mutex_init(&binding->lock);
+
binding->dmabuf = dmabuf;
binding->attachment = dma_buf_attach(binding->dmabuf, dev->dev.parent);
@@ -379,6 +381,11 @@ static void mp_dmabuf_devmem_uninstall(void *mp_priv,
xa_for_each(&binding->bound_rxqs, xa_idx, bound_rxq) {
if (bound_rxq == rxq) {
xa_erase(&binding->bound_rxqs, xa_idx);
+ if (xa_empty(&binding->bound_rxqs)) {
+ mutex_lock(&binding->lock);
+ binding->dev = NULL;
+ mutex_unlock(&binding->lock);
+ }
break;
}
}
diff --git a/net/core/devmem.h b/net/core/devmem.h
index 7fc158d52729..a1aabc9685cc 100644
--- a/net/core/devmem.h
+++ b/net/core/devmem.h
@@ -20,6 +20,8 @@ struct net_devmem_dmabuf_binding {
struct sg_table *sgt;
struct net_device *dev;
struct gen_pool *chunk_pool;
+ /* Protect dev */
+ struct mutex lock;
/* The user holds a ref (via the netlink API) for as long as they want
* the binding to remain alive. Each page pool using this binding holds
diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c
index dae9f0d432fb..a877693fecd6 100644
--- a/net/core/netdev-genl.c
+++ b/net/core/netdev-genl.c
@@ -979,14 +979,25 @@ void netdev_nl_sock_priv_destroy(struct netdev_nl_sock *priv)
{
struct net_devmem_dmabuf_binding *binding;
struct net_devmem_dmabuf_binding *temp;
+ netdevice_tracker dev_tracker;
struct net_device *dev;
mutex_lock(&priv->lock);
list_for_each_entry_safe(binding, temp, &priv->bindings, list) {
+ mutex_lock(&binding->lock);
dev = binding->dev;
+ if (!dev) {
+ mutex_unlock(&binding->lock);
+ net_devmem_unbind_dmabuf(binding);
+ continue;
+ }
+ netdev_hold(dev, &dev_tracker, GFP_KERNEL);
+ mutex_unlock(&binding->lock);
+
netdev_lock(dev);
net_devmem_unbind_dmabuf(binding);
netdev_unlock(dev);
+ netdev_put(dev, &dev_tracker);
}
mutex_unlock(&priv->lock);
}
diff --git a/net/core/sock.c b/net/core/sock.c
index e54449c9ab0b..1d9466a1f54e 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -148,6 +148,8 @@
#include <linux/ethtool.h>
+#include <uapi/linux/pidfd.h>
+
#include "dev.h"
static DEFINE_MUTEX(proto_list_mutex);
@@ -1879,6 +1881,7 @@ int sk_getsockopt(struct sock *sk, int level, int optname,
{
struct pid *peer_pid;
struct file *pidfd_file = NULL;
+ unsigned int flags = 0;
int pidfd;
if (len > sizeof(pidfd))
@@ -1891,7 +1894,14 @@ int sk_getsockopt(struct sock *sk, int level, int optname,
if (!peer_pid)
return -ENODATA;
- pidfd = pidfd_prepare(peer_pid, 0, &pidfd_file);
+ /* The use of PIDFD_STALE requires stashing of struct pid
+ * on pidfs with pidfs_register_pid() and only AF_UNIX
+ * were prepared for this.
+ */
+ if (sk->sk_family == AF_UNIX)
+ flags = PIDFD_STALE;
+
+ pidfd = pidfd_prepare(peer_pid, flags, &pidfd_file);
put_pid(peer_pid);
if (pidfd < 0)
return pidfd;
diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c
index c33d4bf17929..0b7564b53790 100644
--- a/net/dsa/tag_ksz.c
+++ b/net/dsa/tag_ksz.c
@@ -140,7 +140,12 @@ static struct sk_buff *ksz8795_xmit(struct sk_buff *skb, struct net_device *dev)
static struct sk_buff *ksz8795_rcv(struct sk_buff *skb, struct net_device *dev)
{
- u8 *tag = skb_tail_pointer(skb) - KSZ_EGRESS_TAG_LEN;
+ u8 *tag;
+
+ if (skb_linearize(skb))
+ return NULL;
+
+ tag = skb_tail_pointer(skb) - KSZ_EGRESS_TAG_LEN;
return ksz_common_rcv(skb, dev, tag[0] & KSZ8795_TAIL_TAG_EG_PORT_M,
KSZ_EGRESS_TAG_LEN);
@@ -311,10 +316,16 @@ static struct sk_buff *ksz9477_xmit(struct sk_buff *skb,
static struct sk_buff *ksz9477_rcv(struct sk_buff *skb, struct net_device *dev)
{
- /* Tag decoding */
- u8 *tag = skb_tail_pointer(skb) - KSZ_EGRESS_TAG_LEN;
- unsigned int port = tag[0] & KSZ9477_TAIL_TAG_EG_PORT_M;
unsigned int len = KSZ_EGRESS_TAG_LEN;
+ unsigned int port;
+ u8 *tag;
+
+ if (skb_linearize(skb))
+ return NULL;
+
+ /* Tag decoding */
+ tag = skb_tail_pointer(skb) - KSZ_EGRESS_TAG_LEN;
+ port = tag[0] & KSZ9477_TAIL_TAG_EG_PORT_M;
/* Extra 4-bytes PTP timestamp */
if (tag[0] & KSZ9477_PTP_TAG_INDICATION) {
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 0e4076866c0a..f14a41ee4aa1 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -120,47 +120,16 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb)
}
#ifdef CONFIG_INET_ESPINTCP
-struct esp_tcp_sk {
- struct sock *sk;
- struct rcu_head rcu;
-};
-
-static void esp_free_tcp_sk(struct rcu_head *head)
-{
- struct esp_tcp_sk *esk = container_of(head, struct esp_tcp_sk, rcu);
-
- sock_put(esk->sk);
- kfree(esk);
-}
-
static struct sock *esp_find_tcp_sk(struct xfrm_state *x)
{
struct xfrm_encap_tmpl *encap = x->encap;
struct net *net = xs_net(x);
- struct esp_tcp_sk *esk;
__be16 sport, dport;
- struct sock *nsk;
struct sock *sk;
- sk = rcu_dereference(x->encap_sk);
- if (sk && sk->sk_state == TCP_ESTABLISHED)
- return sk;
-
spin_lock_bh(&x->lock);
sport = encap->encap_sport;
dport = encap->encap_dport;
- nsk = rcu_dereference_protected(x->encap_sk,
- lockdep_is_held(&x->lock));
- if (sk && sk == nsk) {
- esk = kmalloc(sizeof(*esk), GFP_ATOMIC);
- if (!esk) {
- spin_unlock_bh(&x->lock);
- return ERR_PTR(-ENOMEM);
- }
- RCU_INIT_POINTER(x->encap_sk, NULL);
- esk->sk = sk;
- call_rcu(&esk->rcu, esp_free_tcp_sk);
- }
spin_unlock_bh(&x->lock);
sk = inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, x->id.daddr.a4,
@@ -173,20 +142,6 @@ static struct sock *esp_find_tcp_sk(struct xfrm_state *x)
return ERR_PTR(-EINVAL);
}
- spin_lock_bh(&x->lock);
- nsk = rcu_dereference_protected(x->encap_sk,
- lockdep_is_held(&x->lock));
- if (encap->encap_sport != sport ||
- encap->encap_dport != dport) {
- sock_put(sk);
- sk = nsk ?: ERR_PTR(-EREMCHG);
- } else if (sk == nsk) {
- sock_put(sk);
- } else {
- rcu_assign_pointer(x->encap_sk, sk);
- }
- spin_unlock_bh(&x->lock);
-
return sk;
}
@@ -199,8 +154,10 @@ static int esp_output_tcp_finish(struct xfrm_state *x, struct sk_buff *skb)
sk = esp_find_tcp_sk(x);
err = PTR_ERR_OR_ZERO(sk);
- if (err)
+ if (err) {
+ kfree_skb(skb);
goto out;
+ }
bh_lock_sock(sk);
if (sock_owned_by_user(sk))
@@ -209,6 +166,8 @@ static int esp_output_tcp_finish(struct xfrm_state *x, struct sk_buff *skb)
err = espintcp_push_skb(sk, skb);
bh_unlock_sock(sk);
+ sock_put(sk);
+
out:
rcu_read_unlock();
return err;
@@ -392,6 +351,8 @@ static struct ip_esp_hdr *esp_output_tcp_encap(struct xfrm_state *x,
if (IS_ERR(sk))
return ERR_CAST(sk);
+ sock_put(sk);
+
*lenp = htons(len);
esph = (struct ip_esp_hdr *)(lenp + 1);
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index a8b04d4abcaa..85dc208f32e9 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -120,11 +120,6 @@ static void ipmr_expire_process(struct timer_list *t);
lockdep_rtnl_is_held() || \
list_empty(&net->ipv4.mr_tables))
-static bool ipmr_can_free_table(struct net *net)
-{
- return !check_net(net) || !net_initialized(net);
-}
-
static struct mr_table *ipmr_mr_table_iter(struct net *net,
struct mr_table *mrt)
{
@@ -317,11 +312,6 @@ EXPORT_SYMBOL(ipmr_rule_default);
#define ipmr_for_each_table(mrt, net) \
for (mrt = net->ipv4.mrt; mrt; mrt = NULL)
-static bool ipmr_can_free_table(struct net *net)
-{
- return !check_net(net);
-}
-
static struct mr_table *ipmr_mr_table_iter(struct net *net,
struct mr_table *mrt)
{
@@ -437,7 +427,7 @@ static void ipmr_free_table(struct mr_table *mrt)
{
struct net *net = read_pnet(&mrt->net);
- WARN_ON_ONCE(!ipmr_can_free_table(net));
+ WARN_ON_ONCE(!mr_can_free_table(net));
timer_shutdown_sync(&mrt->ipmr_expire_timer);
mroute_clean_tables(mrt, MRT_FLUSH_VIFS | MRT_FLUSH_VIFS_STATIC |
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
index b5b06323cfd9..0d31a8c108d4 100644
--- a/net/ipv4/xfrm4_input.c
+++ b/net/ipv4/xfrm4_input.c
@@ -182,11 +182,15 @@ struct sk_buff *xfrm4_gro_udp_encap_rcv(struct sock *sk, struct list_head *head,
int offset = skb_gro_offset(skb);
const struct net_offload *ops;
struct sk_buff *pp = NULL;
- int ret;
-
- offset = offset - sizeof(struct udphdr);
+ int len, dlen;
+ __u8 *udpdata;
+ __be32 *udpdata32;
- if (!pskb_pull(skb, offset))
+ len = skb->len - offset;
+ dlen = offset + min(len, 8);
+ udpdata = skb_gro_header(skb, dlen, offset);
+ udpdata32 = (__be32 *)udpdata;
+ if (unlikely(!udpdata))
return NULL;
rcu_read_lock();
@@ -194,11 +198,10 @@ struct sk_buff *xfrm4_gro_udp_encap_rcv(struct sock *sk, struct list_head *head,
if (!ops || !ops->callbacks.gro_receive)
goto out;
- ret = __xfrm4_udp_encap_rcv(sk, skb, false);
- if (ret)
+ /* check if it is a keepalive or IKE packet */
+ if (len <= sizeof(struct ip_esp_hdr) || udpdata32[0] == 0)
goto out;
- skb_push(skb, offset);
NAPI_GRO_CB(skb)->proto = IPPROTO_UDP;
pp = call_gro_receive(ops->callbacks.gro_receive, head, skb);
@@ -208,7 +211,6 @@ struct sk_buff *xfrm4_gro_udp_encap_rcv(struct sock *sk, struct list_head *head,
out:
rcu_read_unlock();
- skb_push(skb, offset);
NAPI_GRO_CB(skb)->same_flow = 0;
NAPI_GRO_CB(skb)->flush = 1;
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index 9e73944e3b53..72adfc107b55 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -137,47 +137,16 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb)
}
#ifdef CONFIG_INET6_ESPINTCP
-struct esp_tcp_sk {
- struct sock *sk;
- struct rcu_head rcu;
-};
-
-static void esp_free_tcp_sk(struct rcu_head *head)
-{
- struct esp_tcp_sk *esk = container_of(head, struct esp_tcp_sk, rcu);
-
- sock_put(esk->sk);
- kfree(esk);
-}
-
static struct sock *esp6_find_tcp_sk(struct xfrm_state *x)
{
struct xfrm_encap_tmpl *encap = x->encap;
struct net *net = xs_net(x);
- struct esp_tcp_sk *esk;
__be16 sport, dport;
- struct sock *nsk;
struct sock *sk;
- sk = rcu_dereference(x->encap_sk);
- if (sk && sk->sk_state == TCP_ESTABLISHED)
- return sk;
-
spin_lock_bh(&x->lock);
sport = encap->encap_sport;
dport = encap->encap_dport;
- nsk = rcu_dereference_protected(x->encap_sk,
- lockdep_is_held(&x->lock));
- if (sk && sk == nsk) {
- esk = kmalloc(sizeof(*esk), GFP_ATOMIC);
- if (!esk) {
- spin_unlock_bh(&x->lock);
- return ERR_PTR(-ENOMEM);
- }
- RCU_INIT_POINTER(x->encap_sk, NULL);
- esk->sk = sk;
- call_rcu(&esk->rcu, esp_free_tcp_sk);
- }
spin_unlock_bh(&x->lock);
sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, &x->id.daddr.in6,
@@ -190,20 +159,6 @@ static struct sock *esp6_find_tcp_sk(struct xfrm_state *x)
return ERR_PTR(-EINVAL);
}
- spin_lock_bh(&x->lock);
- nsk = rcu_dereference_protected(x->encap_sk,
- lockdep_is_held(&x->lock));
- if (encap->encap_sport != sport ||
- encap->encap_dport != dport) {
- sock_put(sk);
- sk = nsk ?: ERR_PTR(-EREMCHG);
- } else if (sk == nsk) {
- sock_put(sk);
- } else {
- rcu_assign_pointer(x->encap_sk, sk);
- }
- spin_unlock_bh(&x->lock);
-
return sk;
}
@@ -216,8 +171,10 @@ static int esp_output_tcp_finish(struct xfrm_state *x, struct sk_buff *skb)
sk = esp6_find_tcp_sk(x);
err = PTR_ERR_OR_ZERO(sk);
- if (err)
+ if (err) {
+ kfree_skb(skb);
goto out;
+ }
bh_lock_sock(sk);
if (sock_owned_by_user(sk))
@@ -226,6 +183,8 @@ static int esp_output_tcp_finish(struct xfrm_state *x, struct sk_buff *skb)
err = espintcp_push_skb(sk, skb);
bh_unlock_sock(sk);
+ sock_put(sk);
+
out:
rcu_read_unlock();
return err;
@@ -422,6 +381,8 @@ static struct ip_esp_hdr *esp6_output_tcp_encap(struct xfrm_state *x,
if (IS_ERR(sk))
return ERR_CAST(sk);
+ sock_put(sk);
+
*lenp = htons(len);
esph = (struct ip_esp_hdr *)(lenp + 1);
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index b413c9c8a21c..3276cde5ebd7 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -108,11 +108,6 @@ static void ipmr_expire_process(struct timer_list *t);
lockdep_rtnl_is_held() || \
list_empty(&net->ipv6.mr6_tables))
-static bool ip6mr_can_free_table(struct net *net)
-{
- return !check_net(net) || !net_initialized(net);
-}
-
static struct mr_table *ip6mr_mr_table_iter(struct net *net,
struct mr_table *mrt)
{
@@ -306,11 +301,6 @@ EXPORT_SYMBOL(ip6mr_rule_default);
#define ip6mr_for_each_table(mrt, net) \
for (mrt = net->ipv6.mrt6; mrt; mrt = NULL)
-static bool ip6mr_can_free_table(struct net *net)
-{
- return !check_net(net);
-}
-
static struct mr_table *ip6mr_mr_table_iter(struct net *net,
struct mr_table *mrt)
{
@@ -416,7 +406,7 @@ static void ip6mr_free_table(struct mr_table *mrt)
{
struct net *net = read_pnet(&mrt->net);
- WARN_ON_ONCE(!ip6mr_can_free_table(net));
+ WARN_ON_ONCE(!mr_can_free_table(net));
timer_shutdown_sync(&mrt->ipmr_expire_timer);
mroute_clean_tables(mrt, MRT6_FLUSH_MIFS | MRT6_FLUSH_MIFS_STATIC |
diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c
index 4abc5e9d6322..841c81abaaf4 100644
--- a/net/ipv6/xfrm6_input.c
+++ b/net/ipv6/xfrm6_input.c
@@ -179,14 +179,18 @@ struct sk_buff *xfrm6_gro_udp_encap_rcv(struct sock *sk, struct list_head *head,
int offset = skb_gro_offset(skb);
const struct net_offload *ops;
struct sk_buff *pp = NULL;
- int ret;
+ int len, dlen;
+ __u8 *udpdata;
+ __be32 *udpdata32;
if (skb->protocol == htons(ETH_P_IP))
return xfrm4_gro_udp_encap_rcv(sk, head, skb);
- offset = offset - sizeof(struct udphdr);
-
- if (!pskb_pull(skb, offset))
+ len = skb->len - offset;
+ dlen = offset + min(len, 8);
+ udpdata = skb_gro_header(skb, dlen, offset);
+ udpdata32 = (__be32 *)udpdata;
+ if (unlikely(!udpdata))
return NULL;
rcu_read_lock();
@@ -194,11 +198,10 @@ struct sk_buff *xfrm6_gro_udp_encap_rcv(struct sock *sk, struct list_head *head,
if (!ops || !ops->callbacks.gro_receive)
goto out;
- ret = __xfrm6_udp_encap_rcv(sk, skb, false);
- if (ret)
+ /* check if it is a keepalive or IKE packet */
+ if (len <= sizeof(struct ip_esp_hdr) || udpdata32[0] == 0)
goto out;
- skb_push(skb, offset);
NAPI_GRO_CB(skb)->proto = IPPROTO_UDP;
pp = call_gro_receive(ops->callbacks.gro_receive, head, skb);
@@ -208,7 +211,6 @@ struct sk_buff *xfrm6_gro_udp_encap_rcv(struct sock *sk, struct list_head *head,
out:
rcu_read_unlock();
- skb_push(skb, offset);
NAPI_GRO_CB(skb)->same_flow = 0;
NAPI_GRO_CB(skb)->flush = 1;
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index 0259cde394ba..cc77ec5769d8 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -887,15 +887,15 @@ static int llc_ui_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
if (sk->sk_type != SOCK_STREAM)
goto copy_uaddr;
+ /* Partial read */
+ if (used + offset < skb_len)
+ continue;
+
if (!(flags & MSG_PEEK)) {
skb_unlink(skb, &sk->sk_receive_queue);
kfree_skb(skb);
*seq = 0;
}
-
- /* Partial read */
- if (used + offset < skb_len)
- continue;
} while (len > 0);
out:
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 741e6c7edcb7..6b6de43d9420 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -1354,10 +1354,12 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
hw->wiphy->software_iftypes |= BIT(NL80211_IFTYPE_MONITOR);
- local->int_scan_req = kzalloc(sizeof(*local->int_scan_req) +
- sizeof(void *) * channels, GFP_KERNEL);
+ local->int_scan_req = kzalloc(struct_size(local->int_scan_req,
+ channels, channels),
+ GFP_KERNEL);
if (!local->int_scan_req)
return -ENOMEM;
+ local->int_scan_req->n_channels = channels;
eth_broadcast_addr(local->int_scan_req->bssid);
diff --git a/net/mctp/device.c b/net/mctp/device.c
index 8e0724c56723..7c0dcf3df319 100644
--- a/net/mctp/device.c
+++ b/net/mctp/device.c
@@ -117,11 +117,18 @@ static int mctp_dump_addrinfo(struct sk_buff *skb, struct netlink_callback *cb)
struct net_device *dev;
struct ifaddrmsg *hdr;
struct mctp_dev *mdev;
- int ifindex, rc;
-
- hdr = nlmsg_data(cb->nlh);
- // filter by ifindex if requested
- ifindex = hdr->ifa_index;
+ int ifindex = 0, rc;
+
+ /* Filter by ifindex if a header is provided */
+ if (cb->nlh->nlmsg_len >= nlmsg_msg_size(sizeof(*hdr))) {
+ hdr = nlmsg_data(cb->nlh);
+ ifindex = hdr->ifa_index;
+ } else {
+ if (cb->strict_check) {
+ NL_SET_ERR_MSG(cb->extack, "mctp: Invalid header for addr dump request");
+ return -EINVAL;
+ }
+ }
rcu_read_lock();
for_each_netdev_dump(net, dev, mcb->ifindex) {
diff --git a/net/mctp/route.c b/net/mctp/route.c
index 4c460160914f..d9c8e5a5f9ce 100644
--- a/net/mctp/route.c
+++ b/net/mctp/route.c
@@ -313,8 +313,10 @@ static void mctp_flow_prepare_output(struct sk_buff *skb, struct mctp_dev *dev)
key = flow->key;
- if (WARN_ON(key->dev && key->dev != dev))
+ if (key->dev) {
+ WARN_ON(key->dev != dev);
return;
+ }
mctp_dev_set_key(dev, key);
}
diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c
index 12dd71139da3..c93761040c6e 100644
--- a/net/sched/sch_codel.c
+++ b/net/sched/sch_codel.c
@@ -144,7 +144,7 @@ static int codel_change(struct Qdisc *sch, struct nlattr *opt,
qlen = sch->q.qlen;
while (sch->q.qlen > sch->limit) {
- struct sk_buff *skb = __qdisc_dequeue_head(&sch->q);
+ struct sk_buff *skb = qdisc_dequeue_internal(sch, true);
dropped += qdisc_pkt_len(skb);
qdisc_qstats_backlog_dec(sch, skb);
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index 2ca5332cfcc5..902ff5470607 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -1136,7 +1136,7 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt,
sch_tree_lock(sch);
}
while (sch->q.qlen > sch->limit) {
- struct sk_buff *skb = fq_dequeue(sch);
+ struct sk_buff *skb = qdisc_dequeue_internal(sch, false);
if (!skb)
break;
diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
index 6c9029f71e88..2a0f3a513bfa 100644
--- a/net/sched/sch_fq_codel.c
+++ b/net/sched/sch_fq_codel.c
@@ -441,7 +441,7 @@ static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt,
while (sch->q.qlen > sch->limit ||
q->memory_usage > q->memory_limit) {
- struct sk_buff *skb = fq_codel_dequeue(sch);
+ struct sk_buff *skb = qdisc_dequeue_internal(sch, false);
q->cstats.drop_len += qdisc_pkt_len(skb);
rtnl_kfree_skbs(skb, skb);
diff --git a/net/sched/sch_fq_pie.c b/net/sched/sch_fq_pie.c
index f3b8203d3e85..df7fac95ab15 100644
--- a/net/sched/sch_fq_pie.c
+++ b/net/sched/sch_fq_pie.c
@@ -366,7 +366,7 @@ static int fq_pie_change(struct Qdisc *sch, struct nlattr *opt,
/* Drop excess packets if new limit is lower */
while (sch->q.qlen > sch->limit) {
- struct sk_buff *skb = fq_pie_qdisc_dequeue(sch);
+ struct sk_buff *skb = qdisc_dequeue_internal(sch, false);
len_dropped += qdisc_pkt_len(skb);
num_dropped += 1;
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index cb8c525ea20e..7986145a527c 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -1569,6 +1569,9 @@ hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free)
return err;
}
+ sch->qstats.backlog += len;
+ sch->q.qlen++;
+
if (first && !cl->cl_nactive) {
if (cl->cl_flags & HFSC_RSC)
init_ed(cl, len);
@@ -1584,9 +1587,6 @@ hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free)
}
- sch->qstats.backlog += len;
- sch->q.qlen++;
-
return NET_XMIT_SUCCESS;
}
diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c
index 44d9efe1a96a..5aa434b46707 100644
--- a/net/sched/sch_hhf.c
+++ b/net/sched/sch_hhf.c
@@ -564,7 +564,7 @@ static int hhf_change(struct Qdisc *sch, struct nlattr *opt,
qlen = sch->q.qlen;
prev_backlog = sch->qstats.backlog;
while (sch->q.qlen > sch->limit) {
- struct sk_buff *skb = hhf_dequeue(sch);
+ struct sk_buff *skb = qdisc_dequeue_internal(sch, false);
rtnl_kfree_skbs(skb, skb);
}
diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c
index 3771d000b30d..ff49a6c97033 100644
--- a/net/sched/sch_pie.c
+++ b/net/sched/sch_pie.c
@@ -195,7 +195,7 @@ static int pie_change(struct Qdisc *sch, struct nlattr *opt,
/* Drop excess packets if new limit is lower */
qlen = sch->q.qlen;
while (sch->q.qlen > sch->limit) {
- struct sk_buff *skb = __qdisc_dequeue_head(&sch->q);
+ struct sk_buff *skb = qdisc_dequeue_internal(sch, true);
dropped += qdisc_pkt_len(skb);
qdisc_qstats_backlog_dec(sch, skb);
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index eadc00410ebc..98f78cd55905 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -631,7 +631,7 @@ static struct dentry *__rpc_lookup_create_exclusive(struct dentry *parent,
const char *name)
{
struct qstr q = QSTR(name);
- struct dentry *dentry = d_hash_and_lookup(parent, &q);
+ struct dentry *dentry = try_lookup_noperm(&q, parent);
if (!dentry) {
dentry = d_alloc(parent, &q);
if (!dentry)
@@ -658,7 +658,7 @@ static void __rpc_depopulate(struct dentry *parent,
for (i = start; i < eof; i++) {
name.name = files[i].name;
name.len = strlen(files[i].name);
- dentry = d_hash_and_lookup(parent, &name);
+ dentry = try_lookup_noperm(&name, parent);
if (dentry == NULL)
continue;
@@ -1190,7 +1190,7 @@ static const struct rpc_filelist files[] = {
struct dentry *rpc_d_lookup_sb(const struct super_block *sb,
const unsigned char *dir_name)
{
- return d_hash_and_lookup(sb->s_root, &QSTR(dir_name));
+ return try_lookup_noperm(&QSTR(dir_name), sb->s_root);
}
EXPORT_SYMBOL_GPL(rpc_d_lookup_sb);
@@ -1301,7 +1301,7 @@ rpc_gssd_dummy_populate(struct dentry *root, struct rpc_pipe *pipe_data)
struct dentry *pipe_dentry = NULL;
/* We should never get this far if "gssd" doesn't exist */
- gssd_dentry = d_hash_and_lookup(root, &QSTR(files[RPCAUTH_gssd].name));
+ gssd_dentry = try_lookup_noperm(&QSTR(files[RPCAUTH_gssd].name), root);
if (!gssd_dentry)
return ERR_PTR(-ENOENT);
@@ -1311,8 +1311,8 @@ rpc_gssd_dummy_populate(struct dentry *root, struct rpc_pipe *pipe_data)
goto out;
}
- clnt_dentry = d_hash_and_lookup(gssd_dentry,
- &QSTR(gssd_dummy_clnt_dir[0].name));
+ clnt_dentry = try_lookup_noperm(&QSTR(gssd_dummy_clnt_dir[0].name),
+ gssd_dentry);
if (!clnt_dentry) {
__rpc_depopulate(gssd_dentry, gssd_dummy_clnt_dir, 0, 1);
pipe_dentry = ERR_PTR(-ENOENT);
diff --git a/net/tipc/crypto.c b/net/tipc/crypto.c
index c524421ec652..8584893b4785 100644
--- a/net/tipc/crypto.c
+++ b/net/tipc/crypto.c
@@ -817,12 +817,16 @@ static int tipc_aead_encrypt(struct tipc_aead *aead, struct sk_buff *skb,
goto exit;
}
+ /* Get net to avoid freed tipc_crypto when delete namespace */
+ get_net(aead->crypto->net);
+
/* Now, do encrypt */
rc = crypto_aead_encrypt(req);
if (rc == -EINPROGRESS || rc == -EBUSY)
return rc;
tipc_bearer_put(b);
+ put_net(aead->crypto->net);
exit:
kfree(ctx);
@@ -860,6 +864,7 @@ static void tipc_aead_encrypt_done(void *data, int err)
kfree(tx_ctx);
tipc_bearer_put(b);
tipc_aead_put(aead);
+ put_net(net);
}
/**
diff --git a/net/tls/tls_strp.c b/net/tls/tls_strp.c
index 77e33e1e340e..65b0da6fdf6a 100644
--- a/net/tls/tls_strp.c
+++ b/net/tls/tls_strp.c
@@ -396,7 +396,6 @@ static int tls_strp_read_copy(struct tls_strparser *strp, bool qshort)
return 0;
shinfo = skb_shinfo(strp->anchor);
- shinfo->frag_list = NULL;
/* If we don't know the length go max plus page for cipher overhead */
need_spc = strp->stm.full_len ?: TLS_MAX_PAYLOAD_SIZE + PAGE_SIZE;
@@ -412,6 +411,8 @@ static int tls_strp_read_copy(struct tls_strparser *strp, bool qshort)
page, 0, 0);
}
+ shinfo->frag_list = NULL;
+
strp->copy_mode = 1;
strp->stm.offset = 0;
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index f78a2492826f..59a64b2ced6e 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -85,10 +85,13 @@
#include <linux/file.h>
#include <linux/filter.h>
#include <linux/fs.h>
+#include <linux/fs_struct.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/mount.h>
#include <linux/namei.h>
+#include <linux/net.h>
+#include <linux/pidfs.h>
#include <linux/poll.h>
#include <linux/proc_fs.h>
#include <linux/sched/signal.h>
@@ -643,6 +646,9 @@ static void unix_sock_destructor(struct sock *sk)
return;
}
+ if (sk->sk_peer_pid)
+ pidfs_put_pid(sk->sk_peer_pid);
+
if (u->addr)
unix_release_addr(u->addr);
@@ -734,13 +740,48 @@ static void unix_release_sock(struct sock *sk, int embrion)
unix_gc(); /* Garbage collect fds */
}
-static void init_peercred(struct sock *sk)
+struct unix_peercred {
+ struct pid *peer_pid;
+ const struct cred *peer_cred;
+};
+
+static inline int prepare_peercred(struct unix_peercred *peercred)
{
- sk->sk_peer_pid = get_pid(task_tgid(current));
- sk->sk_peer_cred = get_current_cred();
+ struct pid *pid;
+ int err;
+
+ pid = task_tgid(current);
+ err = pidfs_register_pid(pid);
+ if (likely(!err)) {
+ peercred->peer_pid = get_pid(pid);
+ peercred->peer_cred = get_current_cred();
+ }
+ return err;
}
-static void update_peercred(struct sock *sk)
+static void drop_peercred(struct unix_peercred *peercred)
+{
+ const struct cred *cred = NULL;
+ struct pid *pid = NULL;
+
+ might_sleep();
+
+ swap(peercred->peer_pid, pid);
+ swap(peercred->peer_cred, cred);
+
+ pidfs_put_pid(pid);
+ put_pid(pid);
+ put_cred(cred);
+}
+
+static inline void init_peercred(struct sock *sk,
+ const struct unix_peercred *peercred)
+{
+ sk->sk_peer_pid = peercred->peer_pid;
+ sk->sk_peer_cred = peercred->peer_cred;
+}
+
+static void update_peercred(struct sock *sk, struct unix_peercred *peercred)
{
const struct cred *old_cred;
struct pid *old_pid;
@@ -748,11 +789,11 @@ static void update_peercred(struct sock *sk)
spin_lock(&sk->sk_peer_lock);
old_pid = sk->sk_peer_pid;
old_cred = sk->sk_peer_cred;
- init_peercred(sk);
+ init_peercred(sk, peercred);
spin_unlock(&sk->sk_peer_lock);
- put_pid(old_pid);
- put_cred(old_cred);
+ peercred->peer_pid = old_pid;
+ peercred->peer_cred = old_cred;
}
static void copy_peercred(struct sock *sk, struct sock *peersk)
@@ -761,6 +802,7 @@ static void copy_peercred(struct sock *sk, struct sock *peersk)
spin_lock(&sk->sk_peer_lock);
sk->sk_peer_pid = get_pid(peersk->sk_peer_pid);
+ pidfs_get_pid(sk->sk_peer_pid);
sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
spin_unlock(&sk->sk_peer_lock);
}
@@ -770,6 +812,7 @@ static int unix_listen(struct socket *sock, int backlog)
int err;
struct sock *sk = sock->sk;
struct unix_sock *u = unix_sk(sk);
+ struct unix_peercred peercred = {};
err = -EOPNOTSUPP;
if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
@@ -777,6 +820,9 @@ static int unix_listen(struct socket *sock, int backlog)
err = -EINVAL;
if (!READ_ONCE(u->addr))
goto out; /* No listens on an unbound socket */
+ err = prepare_peercred(&peercred);
+ if (err)
+ goto out;
unix_state_lock(sk);
if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
goto out_unlock;
@@ -786,11 +832,12 @@ static int unix_listen(struct socket *sock, int backlog)
WRITE_ONCE(sk->sk_state, TCP_LISTEN);
/* set credentials so connect can copy them */
- update_peercred(sk);
+ update_peercred(sk, &peercred);
err = 0;
out_unlock:
unix_state_unlock(sk);
+ drop_peercred(&peercred);
out:
return err;
}
@@ -1101,7 +1148,7 @@ static int unix_release(struct socket *sock)
}
static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
- int type)
+ int type, int flags)
{
struct inode *inode;
struct path path;
@@ -1109,13 +1156,39 @@ static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
int err;
unix_mkname_bsd(sunaddr, addr_len);
- err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
- if (err)
- goto fail;
- err = path_permission(&path, MAY_WRITE);
- if (err)
- goto path_put;
+ if (flags & SOCK_COREDUMP) {
+ const struct cred *cred;
+ struct cred *kcred;
+ struct path root;
+
+ kcred = prepare_kernel_cred(&init_task);
+ if (!kcred) {
+ err = -ENOMEM;
+ goto fail;
+ }
+
+ task_lock(&init_task);
+ get_fs_root(init_task.fs, &root);
+ task_unlock(&init_task);
+
+ cred = override_creds(kcred);
+ err = vfs_path_lookup(root.dentry, root.mnt, sunaddr->sun_path,
+ LOOKUP_BENEATH | LOOKUP_NO_SYMLINKS |
+ LOOKUP_NO_MAGICLINKS, &path);
+ put_cred(revert_creds(cred));
+ path_put(&root);
+ if (err)
+ goto fail;
+ } else {
+ err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
+ if (err)
+ goto fail;
+
+ err = path_permission(&path, MAY_WRITE);
+ if (err)
+ goto path_put;
+ }
err = -ECONNREFUSED;
inode = d_backing_inode(path.dentry);
@@ -1165,12 +1238,12 @@ static struct sock *unix_find_abstract(struct net *net,
static struct sock *unix_find_other(struct net *net,
struct sockaddr_un *sunaddr,
- int addr_len, int type)
+ int addr_len, int type, int flags)
{
struct sock *sk;
if (sunaddr->sun_path[0])
- sk = unix_find_bsd(sunaddr, addr_len, type);
+ sk = unix_find_bsd(sunaddr, addr_len, type, flags);
else
sk = unix_find_abstract(net, sunaddr, addr_len, type);
@@ -1428,7 +1501,7 @@ static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
}
restart:
- other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
+ other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type, 0);
if (IS_ERR(other)) {
err = PTR_ERR(other);
goto out;
@@ -1525,6 +1598,7 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
struct unix_sock *u = unix_sk(sk), *newu, *otheru;
+ struct unix_peercred peercred = {};
struct net *net = sock_net(sk);
struct sk_buff *skb = NULL;
unsigned char state;
@@ -1561,6 +1635,10 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
goto out;
}
+ err = prepare_peercred(&peercred);
+ if (err)
+ goto out;
+
/* Allocate skb for sending to listening sock */
skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
if (!skb) {
@@ -1570,7 +1648,7 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
restart:
/* Find listening sock. */
- other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
+ other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, flags);
if (IS_ERR(other)) {
err = PTR_ERR(other);
goto out_free_skb;
@@ -1636,7 +1714,7 @@ restart:
unix_peer(newsk) = sk;
newsk->sk_state = TCP_ESTABLISHED;
newsk->sk_type = sk->sk_type;
- init_peercred(newsk);
+ init_peercred(newsk, &peercred);
newu = unix_sk(newsk);
newu->listener = other;
RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
@@ -1695,20 +1773,33 @@ out_free_skb:
out_free_sk:
unix_release_sock(newsk, 0);
out:
+ drop_peercred(&peercred);
return err;
}
static int unix_socketpair(struct socket *socka, struct socket *sockb)
{
+ struct unix_peercred ska_peercred = {}, skb_peercred = {};
struct sock *ska = socka->sk, *skb = sockb->sk;
+ int err;
+
+ err = prepare_peercred(&ska_peercred);
+ if (err)
+ return err;
+
+ err = prepare_peercred(&skb_peercred);
+ if (err) {
+ drop_peercred(&ska_peercred);
+ return err;
+ }
/* Join our sockets back to back */
sock_hold(ska);
sock_hold(skb);
unix_peer(ska) = skb;
unix_peer(skb) = ska;
- init_peercred(ska);
- init_peercred(skb);
+ init_peercred(ska, &ska_peercred);
+ init_peercred(skb, &skb_peercred);
ska->sk_state = TCP_ESTABLISHED;
skb->sk_state = TCP_ESTABLISHED;
@@ -2026,7 +2117,7 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
if (msg->msg_namelen) {
lookup:
other = unix_find_other(sock_net(sk), msg->msg_name,
- msg->msg_namelen, sk->sk_type);
+ msg->msg_namelen, sk->sk_type, 0);
if (IS_ERR(other)) {
err = PTR_ERR(other);
goto out_free;
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 4abc81f33d3e..72c000c0ae5f 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -1304,7 +1304,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
xs->queue_id = qid;
xp_add_xsk(xs->pool, xs);
- if (xs->zc && qid < dev->real_num_rx_queues) {
+ if (qid < dev->real_num_rx_queues) {
struct netdev_rx_queue *rxq;
rxq = __netif_get_rx_queue(dev, qid);
diff --git a/net/xfrm/espintcp.c b/net/xfrm/espintcp.c
index fe82e2d07300..fc7a603b04f1 100644
--- a/net/xfrm/espintcp.c
+++ b/net/xfrm/espintcp.c
@@ -171,8 +171,10 @@ int espintcp_queue_out(struct sock *sk, struct sk_buff *skb)
struct espintcp_ctx *ctx = espintcp_getctx(sk);
if (skb_queue_len(&ctx->out_queue) >=
- READ_ONCE(net_hotdata.max_backlog))
+ READ_ONCE(net_hotdata.max_backlog)) {
+ kfree_skb(skb);
return -ENOBUFS;
+ }
__skb_queue_tail(&ctx->out_queue, skb);
diff --git a/net/xfrm/xfrm_ipcomp.c b/net/xfrm/xfrm_ipcomp.c
index 0c1420534394..907c3ccb440d 100644
--- a/net/xfrm/xfrm_ipcomp.c
+++ b/net/xfrm/xfrm_ipcomp.c
@@ -48,7 +48,6 @@ static int ipcomp_post_acomp(struct sk_buff *skb, int err, int hlen)
{
struct acomp_req *req = ipcomp_cb(skb)->req;
struct ipcomp_req_extra *extra;
- const int plen = skb->data_len;
struct scatterlist *dsg;
int len, dlen;
@@ -64,7 +63,7 @@ static int ipcomp_post_acomp(struct sk_buff *skb, int err, int hlen)
/* Only update truesize on input. */
if (!hlen)
- skb->truesize += dlen - plen;
+ skb->truesize += dlen;
skb->data_len = dlen;
skb->len += dlen;
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 143ac3aa7537..f4bad8c895d6 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -1581,6 +1581,9 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
struct xfrm_policy *delpol;
struct hlist_head *chain;
+ /* Sanitize mark before store */
+ policy->mark.v &= policy->mark.m;
+
spin_lock_bh(&net->xfrm.xfrm_policy_lock);
chain = policy_hash_bysel(net, &policy->selector, policy->family, dir);
if (chain)
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 341d79ecb5c2..07fe8e5daa32 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -838,9 +838,6 @@ int __xfrm_state_delete(struct xfrm_state *x)
xfrm_nat_keepalive_state_updated(x);
spin_unlock(&net->xfrm.xfrm_state_lock);
- if (x->encap_sk)
- sock_put(rcu_dereference_raw(x->encap_sk));
-
xfrm_dev_state_delete(x);
/* All xfrm_state objects are created by xfrm_state_alloc.
@@ -1721,6 +1718,9 @@ static void __xfrm_state_insert(struct xfrm_state *x)
list_add(&x->km.all, &net->xfrm.state_all);
+ /* Sanitize mark before store */
+ x->mark.v &= x->mark.m;
+
h = xfrm_dst_hash(net, &x->id.daddr, &x->props.saddr,
x->props.reqid, x->props.family);
XFRM_STATE_INSERT(bydst, &x->bydst, net->xfrm.state_bydst + h,
diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c
index 6039afae4bfc..0aef34b9609b 100644
--- a/security/apparmor/apparmorfs.c
+++ b/security/apparmor/apparmorfs.c
@@ -283,7 +283,7 @@ static struct dentry *aafs_create(const char *name, umode_t mode,
dir = d_inode(parent);
inode_lock(dir);
- dentry = lookup_one_len(name, parent, strlen(name));
+ dentry = lookup_noperm(&QSTR(name), parent);
if (IS_ERR(dentry)) {
error = PTR_ERR(dentry);
goto fail_lock;
@@ -2551,7 +2551,7 @@ static int aa_mk_null_file(struct dentry *parent)
return error;
inode_lock(d_inode(parent));
- dentry = lookup_one_len(NULL_FILE_NAME, parent, strlen(NULL_FILE_NAME));
+ dentry = lookup_noperm(&QSTR(NULL_FILE_NAME), parent);
if (IS_ERR(dentry)) {
error = PTR_ERR(dentry);
goto out;
diff --git a/security/inode.c b/security/inode.c
index da3ab44c8e57..3913501621fa 100644
--- a/security/inode.c
+++ b/security/inode.c
@@ -128,7 +128,7 @@ static struct dentry *securityfs_create_dentry(const char *name, umode_t mode,
dir = d_inode(parent);
inode_lock(dir);
- dentry = lookup_one_len(name, parent, strlen(name));
+ dentry = lookup_noperm(&QSTR(name), parent);
if (IS_ERR(dentry))
goto out;
diff --git a/security/landlock/audit.c b/security/landlock/audit.c
index 7e5e0ed0e4e5..c52d079cdb77 100644
--- a/security/landlock/audit.c
+++ b/security/landlock/audit.c
@@ -175,7 +175,7 @@ static void test_get_hierarchy(struct kunit *const test)
KUNIT_EXPECT_EQ(test, 10, get_hierarchy(&dom2, 0)->id);
KUNIT_EXPECT_EQ(test, 20, get_hierarchy(&dom2, 1)->id);
KUNIT_EXPECT_EQ(test, 30, get_hierarchy(&dom2, 2)->id);
- KUNIT_EXPECT_EQ(test, 30, get_hierarchy(&dom2, -1)->id);
+ /* KUNIT_EXPECT_EQ(test, 30, get_hierarchy(&dom2, -1)->id); */
}
#endif /* CONFIG_SECURITY_LANDLOCK_KUNIT_TEST */
@@ -437,7 +437,7 @@ void landlock_log_denial(const struct landlock_cred_security *const subject,
return;
/* Checks if the current exec was restricting itself. */
- if (subject->domain_exec & (1 << youngest_layer)) {
+ if (subject->domain_exec & BIT(youngest_layer)) {
/* Ignores denials for the same execution. */
if (!youngest_denied->log_same_exec)
return;
diff --git a/security/landlock/id.c b/security/landlock/id.c
index 11fab9259c15..56f7cc0fc744 100644
--- a/security/landlock/id.c
+++ b/security/landlock/id.c
@@ -7,6 +7,7 @@
#include <kunit/test.h>
#include <linux/atomic.h>
+#include <linux/bitops.h>
#include <linux/random.h>
#include <linux/spinlock.h>
@@ -25,7 +26,7 @@ static void __init init_id(atomic64_t *const counter, const u32 random_32bits)
* Ensures sure 64-bit values are always used by user space (or may
* fail with -EOVERFLOW), and makes this testable.
*/
- init = 1ULL << 32;
+ init = BIT_ULL(32);
/*
* Makes a large (2^32) boot-time value to limit ID collision in logs
@@ -105,7 +106,7 @@ static u64 get_id_range(size_t number_of_ids, atomic64_t *const counter,
* to get a new ID (e.g. a full landlock_restrict_self() call), and the
* cost of draining all available IDs during the system's uptime.
*/
- random_4bits = random_4bits % (1 << 4);
+ random_4bits &= 0b1111;
step = number_of_ids + random_4bits;
/* It is safe to cast a signed atomic to an unsigned value. */
@@ -144,6 +145,19 @@ static void test_range1_rand1(struct kunit *const test)
init + 2);
}
+static void test_range1_rand15(struct kunit *const test)
+{
+ atomic64_t counter;
+ u64 init;
+
+ init = get_random_u32();
+ atomic64_set(&counter, init);
+ KUNIT_EXPECT_EQ(test, get_id_range(1, &counter, 15), init);
+ KUNIT_EXPECT_EQ(
+ test, get_id_range(get_random_u8(), &counter, get_random_u8()),
+ init + 16);
+}
+
static void test_range1_rand16(struct kunit *const test)
{
atomic64_t counter;
@@ -196,6 +210,19 @@ static void test_range2_rand2(struct kunit *const test)
init + 4);
}
+static void test_range2_rand15(struct kunit *const test)
+{
+ atomic64_t counter;
+ u64 init;
+
+ init = get_random_u32();
+ atomic64_set(&counter, init);
+ KUNIT_EXPECT_EQ(test, get_id_range(2, &counter, 15), init);
+ KUNIT_EXPECT_EQ(
+ test, get_id_range(get_random_u8(), &counter, get_random_u8()),
+ init + 17);
+}
+
static void test_range2_rand16(struct kunit *const test)
{
atomic64_t counter;
@@ -232,10 +259,12 @@ static struct kunit_case __refdata test_cases[] = {
KUNIT_CASE(test_init_once),
KUNIT_CASE(test_range1_rand0),
KUNIT_CASE(test_range1_rand1),
+ KUNIT_CASE(test_range1_rand15),
KUNIT_CASE(test_range1_rand16),
KUNIT_CASE(test_range2_rand0),
KUNIT_CASE(test_range2_rand1),
KUNIT_CASE(test_range2_rand2),
+ KUNIT_CASE(test_range2_rand15),
KUNIT_CASE(test_range2_rand16),
{}
/* clang-format on */
diff --git a/security/landlock/syscalls.c b/security/landlock/syscalls.c
index b9561e3417ae..33eafb71e4f3 100644
--- a/security/landlock/syscalls.c
+++ b/security/landlock/syscalls.c
@@ -9,6 +9,7 @@
#include <asm/current.h>
#include <linux/anon_inodes.h>
+#include <linux/bitops.h>
#include <linux/build_bug.h>
#include <linux/capability.h>
#include <linux/cleanup.h>
@@ -563,7 +564,7 @@ SYSCALL_DEFINE2(landlock_restrict_self, const int, ruleset_fd, const __u32,
new_llcred->domain = new_dom;
#ifdef CONFIG_AUDIT
- new_llcred->domain_exec |= 1 << (new_dom->num_layers - 1);
+ new_llcred->domain_exec |= BIT(new_dom->num_layers - 1);
#endif /* CONFIG_AUDIT */
return commit_creds(new_cred);
diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c
index 47480eb2189b..e67a8ce4b64c 100644
--- a/security/selinux/selinuxfs.c
+++ b/security/selinux/selinuxfs.c
@@ -2158,8 +2158,8 @@ static int __init init_sel_fs(void)
return err;
}
- selinux_null.dentry = d_hash_and_lookup(selinux_null.mnt->mnt_root,
- &null_name);
+ selinux_null.dentry = try_lookup_noperm(&null_name,
+ selinux_null.mnt->mnt_root);
if (IS_ERR(selinux_null.dentry)) {
pr_err("selinuxfs: could not lookup null!\n");
err = PTR_ERR(selinux_null.dentry);
diff --git a/sound/core/oss/pcm_oss.c b/sound/core/oss/pcm_oss.c
index 4683b9139c56..4ecb17bd5436 100644
--- a/sound/core/oss/pcm_oss.c
+++ b/sound/core/oss/pcm_oss.c
@@ -1074,8 +1074,7 @@ static int snd_pcm_oss_change_params_locked(struct snd_pcm_substream *substream)
runtime->oss.params = 0;
runtime->oss.prepare = 1;
runtime->oss.buffer_used = 0;
- if (runtime->dma_area)
- snd_pcm_format_set_silence(runtime->format, runtime->dma_area, bytes_to_samples(runtime, runtime->dma_bytes));
+ snd_pcm_runtime_buffer_set_silence(runtime);
runtime->oss.period_frames = snd_pcm_alsa_frames(substream, oss_period_size);
diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c
index 6c2b6a62d9d2..853ac5bb33ff 100644
--- a/sound/core/pcm_native.c
+++ b/sound/core/pcm_native.c
@@ -723,6 +723,17 @@ static void snd_pcm_buffer_access_unlock(struct snd_pcm_runtime *runtime)
atomic_inc(&runtime->buffer_accessing);
}
+/* fill the PCM buffer with the current silence format; called from pcm_oss.c */
+void snd_pcm_runtime_buffer_set_silence(struct snd_pcm_runtime *runtime)
+{
+ snd_pcm_buffer_access_lock(runtime);
+ if (runtime->dma_area)
+ snd_pcm_format_set_silence(runtime->format, runtime->dma_area,
+ bytes_to_samples(runtime, runtime->dma_bytes));
+ snd_pcm_buffer_access_unlock(runtime);
+}
+EXPORT_SYMBOL_GPL(snd_pcm_runtime_buffer_set_silence);
+
#if IS_ENABLED(CONFIG_SND_PCM_OSS)
#define is_oss_stream(substream) ((substream)->oss.oss)
#else
diff --git a/sound/core/seq/seq_clientmgr.c b/sound/core/seq/seq_clientmgr.c
index 198c598a5393..880240924bfd 100644
--- a/sound/core/seq/seq_clientmgr.c
+++ b/sound/core/seq/seq_clientmgr.c
@@ -732,15 +732,21 @@ static int snd_seq_deliver_single_event(struct snd_seq_client *client,
*/
static int __deliver_to_subscribers(struct snd_seq_client *client,
struct snd_seq_event *event,
- struct snd_seq_client_port *src_port,
- int atomic, int hop)
+ int port, int atomic, int hop)
{
+ struct snd_seq_client_port *src_port;
struct snd_seq_subscribers *subs;
int err, result = 0, num_ev = 0;
union __snd_seq_event event_saved;
size_t saved_size;
struct snd_seq_port_subs_info *grp;
+ if (port < 0)
+ return 0;
+ src_port = snd_seq_port_use_ptr(client, port);
+ if (!src_port)
+ return 0;
+
/* save original event record */
saved_size = snd_seq_event_packet_size(event);
memcpy(&event_saved, event, saved_size);
@@ -775,6 +781,7 @@ static int __deliver_to_subscribers(struct snd_seq_client *client,
read_unlock(&grp->list_lock);
else
up_read(&grp->list_mutex);
+ snd_seq_port_unlock(src_port);
memcpy(event, &event_saved, saved_size);
return (result < 0) ? result : num_ev;
}
@@ -783,25 +790,32 @@ static int deliver_to_subscribers(struct snd_seq_client *client,
struct snd_seq_event *event,
int atomic, int hop)
{
- struct snd_seq_client_port *src_port;
- int ret = 0, ret2;
-
- src_port = snd_seq_port_use_ptr(client, event->source.port);
- if (src_port) {
- ret = __deliver_to_subscribers(client, event, src_port, atomic, hop);
- snd_seq_port_unlock(src_port);
- }
-
- if (client->ump_endpoint_port < 0 ||
- event->source.port == client->ump_endpoint_port)
- return ret;
+ int ret;
+#if IS_ENABLED(CONFIG_SND_SEQ_UMP)
+ int ret2;
+#endif
- src_port = snd_seq_port_use_ptr(client, client->ump_endpoint_port);
- if (!src_port)
+ ret = __deliver_to_subscribers(client, event,
+ event->source.port, atomic, hop);
+#if IS_ENABLED(CONFIG_SND_SEQ_UMP)
+ if (!snd_seq_client_is_ump(client) || client->ump_endpoint_port < 0)
return ret;
- ret2 = __deliver_to_subscribers(client, event, src_port, atomic, hop);
- snd_seq_port_unlock(src_port);
- return ret2 < 0 ? ret2 : ret;
+ /* If it's an event from EP port (and with a UMP group),
+ * deliver to subscribers of the corresponding UMP group port, too.
+ * Or, if it's from non-EP port, deliver to subscribers of EP port, too.
+ */
+ if (event->source.port == client->ump_endpoint_port)
+ ret2 = __deliver_to_subscribers(client, event,
+ snd_seq_ump_group_port(event),
+ atomic, hop);
+ else
+ ret2 = __deliver_to_subscribers(client, event,
+ client->ump_endpoint_port,
+ atomic, hop);
+ if (ret2 < 0)
+ return ret2;
+#endif
+ return ret;
}
/* deliver an event to the destination port(s).
diff --git a/sound/core/seq/seq_ump_convert.c b/sound/core/seq/seq_ump_convert.c
index ff7e558b4d51..db2f169cae11 100644
--- a/sound/core/seq/seq_ump_convert.c
+++ b/sound/core/seq/seq_ump_convert.c
@@ -1285,3 +1285,21 @@ int snd_seq_deliver_to_ump(struct snd_seq_client *source,
else
return cvt_to_ump_midi1(dest, dest_port, event, atomic, hop);
}
+
+/* return the UMP group-port number of the event;
+ * return -1 if groupless or non-UMP event
+ */
+int snd_seq_ump_group_port(const struct snd_seq_event *event)
+{
+ const struct snd_seq_ump_event *ump_ev =
+ (const struct snd_seq_ump_event *)event;
+ unsigned char type;
+
+ if (!snd_seq_ev_is_ump(event))
+ return -1;
+ type = ump_message_type(ump_ev->ump[0]);
+ if (ump_is_groupless_msg(type))
+ return -1;
+ /* group-port number starts from 1 */
+ return ump_message_group(ump_ev->ump[0]) + 1;
+}
diff --git a/sound/core/seq/seq_ump_convert.h b/sound/core/seq/seq_ump_convert.h
index 6c146d803280..4abf0a7637d7 100644
--- a/sound/core/seq/seq_ump_convert.h
+++ b/sound/core/seq/seq_ump_convert.h
@@ -18,5 +18,6 @@ int snd_seq_deliver_to_ump(struct snd_seq_client *source,
struct snd_seq_client_port *dest_port,
struct snd_seq_event *event,
int atomic, int hop);
+int snd_seq_ump_group_port(const struct snd_seq_event *event);
#endif /* __SEQ_UMP_CONVERT_H */
diff --git a/sound/hda/intel-sdw-acpi.c b/sound/hda/intel-sdw-acpi.c
index 8686adaf4531..d3511135f7d3 100644
--- a/sound/hda/intel-sdw-acpi.c
+++ b/sound/hda/intel-sdw-acpi.c
@@ -177,7 +177,7 @@ static acpi_status sdw_intel_acpi_cb(acpi_handle handle, u32 level,
* sdw_intel_startup() is required for creation of devices and bus
* startup
*/
-int sdw_intel_acpi_scan(acpi_handle *parent_handle,
+int sdw_intel_acpi_scan(acpi_handle parent_handle,
struct sdw_intel_acpi_info *info)
{
acpi_status status;
diff --git a/sound/pci/es1968.c b/sound/pci/es1968.c
index c6c018b40c69..4e0693f0ab0f 100644
--- a/sound/pci/es1968.c
+++ b/sound/pci/es1968.c
@@ -1561,7 +1561,7 @@ static int snd_es1968_capture_open(struct snd_pcm_substream *substream)
struct snd_pcm_runtime *runtime = substream->runtime;
struct es1968 *chip = snd_pcm_substream_chip(substream);
struct esschan *es;
- int apu1, apu2;
+ int err, apu1, apu2;
apu1 = snd_es1968_alloc_apu_pair(chip, ESM_APU_PCM_CAPTURE);
if (apu1 < 0)
@@ -1605,7 +1605,9 @@ static int snd_es1968_capture_open(struct snd_pcm_substream *substream)
runtime->hw = snd_es1968_capture;
runtime->hw.buffer_bytes_max = runtime->hw.period_bytes_max =
calc_available_memory_size(chip) - 1024; /* keep MIXBUF size */
- snd_pcm_hw_constraint_pow2(runtime, 0, SNDRV_PCM_HW_PARAM_BUFFER_BYTES);
+ err = snd_pcm_hw_constraint_pow2(runtime, 0, SNDRV_PCM_HW_PARAM_BUFFER_BYTES);
+ if (err < 0)
+ return err;
spin_lock_irq(&chip->substream_lock);
list_add(&es->list, &chip->substream_list);
diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c
index 8a2b09e4a7d5..20ab1fb2195f 100644
--- a/sound/pci/hda/patch_realtek.c
+++ b/sound/pci/hda/patch_realtek.c
@@ -6830,7 +6830,10 @@ static void alc256_fixup_chromebook(struct hda_codec *codec,
switch (action) {
case HDA_FIXUP_ACT_PRE_PROBE:
- spec->gen.suppress_auto_mute = 1;
+ if (codec->core.subsystem_id == 0x10280d76)
+ spec->gen.suppress_auto_mute = 0;
+ else
+ spec->gen.suppress_auto_mute = 1;
spec->gen.suppress_auto_mic = 1;
spec->en_3kpull_low = false;
break;
@@ -10882,9 +10885,12 @@ static const struct hda_quirk alc269_fixup_tbl[] = {
SND_PCI_QUIRK(0x103c, 0x8e1a, "HP ZBook Firefly 14 G12A", ALC245_FIXUP_HP_ZBOOK_FIREFLY_G12A),
SND_PCI_QUIRK(0x103c, 0x8e1b, "HP EliteBook G12", ALC245_FIXUP_HP_ZBOOK_FIREFLY_G12A),
SND_PCI_QUIRK(0x103c, 0x8e1c, "HP EliteBook G12", ALC245_FIXUP_HP_ZBOOK_FIREFLY_G12A),
+ SND_PCI_QUIRK(0x103c, 0x8e1d, "HP ZBook X Gli 16 G12", ALC236_FIXUP_HP_GPIO_LED),
SND_PCI_QUIRK(0x103c, 0x8e2c, "HP EliteBook 16 G12", ALC285_FIXUP_HP_GPIO_LED),
SND_PCI_QUIRK(0x103c, 0x8e36, "HP 14 Enstrom OmniBook X", ALC287_FIXUP_CS35L41_I2C_2),
SND_PCI_QUIRK(0x103c, 0x8e37, "HP 16 Piston OmniBook X", ALC287_FIXUP_CS35L41_I2C_2),
+ SND_PCI_QUIRK(0x103c, 0x8e3a, "HP Agusta", ALC287_FIXUP_CS35L41_I2C_2),
+ SND_PCI_QUIRK(0x103c, 0x8e3b, "HP Agusta", ALC287_FIXUP_CS35L41_I2C_2),
SND_PCI_QUIRK(0x103c, 0x8e60, "HP Trekker ", ALC287_FIXUP_CS35L41_I2C_2),
SND_PCI_QUIRK(0x103c, 0x8e61, "HP Trekker ", ALC287_FIXUP_CS35L41_I2C_2),
SND_PCI_QUIRK(0x103c, 0x8e62, "HP Trekker ", ALC287_FIXUP_CS35L41_I2C_2),
@@ -11300,6 +11306,7 @@ static const struct hda_quirk alc269_fixup_tbl[] = {
SND_PCI_QUIRK(0x17aa, 0x38fa, "Thinkbook 16P Gen5", ALC287_FIXUP_MG_RTKC_CSAMP_CS35L41_I2C_THINKPAD),
SND_PCI_QUIRK(0x17aa, 0x38fd, "ThinkBook plus Gen5 Hybrid", ALC287_FIXUP_TAS2781_I2C),
SND_PCI_QUIRK(0x17aa, 0x3902, "Lenovo E50-80", ALC269_FIXUP_DMIC_THINKPAD_ACPI),
+ SND_PCI_QUIRK(0x17aa, 0x390d, "Lenovo Yoga Pro 7 14ASP10", ALC287_FIXUP_YOGA9_14IAP7_BASS_SPK_PIN),
SND_PCI_QUIRK(0x17aa, 0x3913, "Lenovo 145", ALC236_FIXUP_LENOVO_INV_DMIC),
SND_PCI_QUIRK(0x17aa, 0x391f, "Yoga S990-16 pro Quad YC Quad", ALC287_FIXUP_TAS2781_I2C),
SND_PCI_QUIRK(0x17aa, 0x3920, "Yoga S990-16 pro Quad VECO Quad", ALC287_FIXUP_TAS2781_I2C),
diff --git a/sound/sh/Kconfig b/sound/sh/Kconfig
index b75fbb3236a7..f5fa09d740b4 100644
--- a/sound/sh/Kconfig
+++ b/sound/sh/Kconfig
@@ -14,7 +14,7 @@ if SND_SUPERH
config SND_AICA
tristate "Dreamcast Yamaha AICA sound"
- depends on SH_DREAMCAST
+ depends on SH_DREAMCAST && SH_DMA_API
select SND_PCM
select G2_DMA
help
diff --git a/sound/soc/mediatek/Kconfig b/sound/soc/mediatek/Kconfig
index 3033e2d3fe16..90e367586493 100644
--- a/sound/soc/mediatek/Kconfig
+++ b/sound/soc/mediatek/Kconfig
@@ -228,6 +228,7 @@ config SND_SOC_MT8188
config SND_SOC_MT8188_MT6359
tristate "ASoC Audio driver for MT8188 with MT6359 and I2S codecs"
depends on SND_SOC_MT8188 && MTK_PMIC_WRAP
+ depends on SND_SOC_MT6359_ACCDET || !SND_SOC_MT6359_ACCDET
depends on I2C
select SND_SOC_MT6359
select SND_SOC_HDMI_CODEC
diff --git a/sound/soc/sof/intel/hda-bus.c b/sound/soc/sof/intel/hda-bus.c
index b1be03011d7e..6492e1cefbfb 100644
--- a/sound/soc/sof/intel/hda-bus.c
+++ b/sound/soc/sof/intel/hda-bus.c
@@ -76,7 +76,7 @@ void sof_hda_bus_init(struct snd_sof_dev *sdev, struct device *dev)
snd_hdac_ext_bus_init(bus, dev, &bus_core_ops, sof_hda_ext_ops);
- if (chip && chip->hw_ip_version == SOF_INTEL_ACE_2_0)
+ if (chip && chip->hw_ip_version >= SOF_INTEL_ACE_2_0)
bus->use_pio_for_commands = true;
#else
snd_hdac_ext_bus_init(bus, dev, NULL, NULL);
diff --git a/sound/soc/sof/intel/hda.c b/sound/soc/sof/intel/hda.c
index b34e5fdf10f1..6a3932d90b43 100644
--- a/sound/soc/sof/intel/hda.c
+++ b/sound/soc/sof/intel/hda.c
@@ -1049,7 +1049,21 @@ static void hda_generic_machine_select(struct snd_sof_dev *sdev,
if (!*mach && codec_num <= 2) {
bool tplg_fixup = false;
- hda_mach = snd_soc_acpi_intel_hda_machines;
+ /*
+ * make a local copy of the match array since we might
+ * be modifying it
+ */
+ hda_mach = devm_kmemdup_array(sdev->dev,
+ snd_soc_acpi_intel_hda_machines,
+ 2, /* we have one entry + sentinel in the array */
+ sizeof(snd_soc_acpi_intel_hda_machines[0]),
+ GFP_KERNEL);
+ if (!hda_mach) {
+ dev_err(bus->dev,
+ "%s: failed to duplicate the HDA match table\n",
+ __func__);
+ return;
+ }
dev_info(bus->dev, "using HDA machine driver %s now\n",
hda_mach->drv_name);
diff --git a/sound/soc/sof/ipc4-control.c b/sound/soc/sof/ipc4-control.c
index 576f407cd456..976a4794d610 100644
--- a/sound/soc/sof/ipc4-control.c
+++ b/sound/soc/sof/ipc4-control.c
@@ -531,6 +531,14 @@ static int sof_ipc4_bytes_ext_put(struct snd_sof_control *scontrol,
return -EINVAL;
}
+ /* Check header id */
+ if (header.numid != SOF_CTRL_CMD_BINARY) {
+ dev_err_ratelimited(scomp->dev,
+ "Incorrect numid for bytes put %d\n",
+ header.numid);
+ return -EINVAL;
+ }
+
/* Verify the ABI header first */
if (copy_from_user(&abi_hdr, tlvd->tlv, sizeof(abi_hdr)))
return -EFAULT;
@@ -613,7 +621,8 @@ static int _sof_ipc4_bytes_ext_get(struct snd_sof_control *scontrol,
if (data_size > size)
return -ENOSPC;
- header.numid = scontrol->comp_id;
+ /* Set header id and length */
+ header.numid = SOF_CTRL_CMD_BINARY;
header.length = data_size;
if (copy_to_user(tlvd, &header, sizeof(struct snd_ctl_tlv)))
diff --git a/sound/soc/sof/ipc4-pcm.c b/sound/soc/sof/ipc4-pcm.c
index 1a2841899ff5..c09b424ab863 100644
--- a/sound/soc/sof/ipc4-pcm.c
+++ b/sound/soc/sof/ipc4-pcm.c
@@ -798,7 +798,8 @@ static int sof_ipc4_pcm_setup(struct snd_sof_dev *sdev, struct snd_sof_pcm *spcm
spcm->stream[stream].private = stream_priv;
- if (!support_info)
+ /* Delay reporting is only supported on playback */
+ if (!support_info || stream == SNDRV_PCM_STREAM_CAPTURE)
continue;
time_info = kzalloc(sizeof(*time_info), GFP_KERNEL);
diff --git a/sound/soc/sof/topology.c b/sound/soc/sof/topology.c
index dc9cb8324067..14aa8ecc4bc4 100644
--- a/sound/soc/sof/topology.c
+++ b/sound/soc/sof/topology.c
@@ -1063,7 +1063,7 @@ static int sof_connect_dai_widget(struct snd_soc_component *scomp,
struct snd_sof_dai *dai)
{
struct snd_soc_card *card = scomp->card;
- struct snd_soc_pcm_runtime *rtd;
+ struct snd_soc_pcm_runtime *rtd, *full, *partial;
struct snd_soc_dai *cpu_dai;
int stream;
int i;
@@ -1080,12 +1080,22 @@ static int sof_connect_dai_widget(struct snd_soc_component *scomp,
else
goto end;
+ full = NULL;
+ partial = NULL;
list_for_each_entry(rtd, &card->rtd_list, list) {
/* does stream match DAI link ? */
- if (!rtd->dai_link->stream_name ||
- !strstr(rtd->dai_link->stream_name, w->sname))
- continue;
+ if (rtd->dai_link->stream_name) {
+ if (!strcmp(rtd->dai_link->stream_name, w->sname)) {
+ full = rtd;
+ break;
+ } else if (strstr(rtd->dai_link->stream_name, w->sname)) {
+ partial = rtd;
+ }
+ }
+ }
+ rtd = full ? full : partial;
+ if (rtd) {
for_each_rtd_cpu_dais(rtd, i, cpu_dai) {
/*
* Please create DAI widget in the right order
diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c
index 9112313a9dbc..dbbc9eb935a4 100644
--- a/sound/usb/quirks.c
+++ b/sound/usb/quirks.c
@@ -2242,6 +2242,8 @@ static const struct usb_audio_quirk_flags_table quirk_flags_table[] = {
QUIRK_FLAG_CTL_MSG_DELAY_1M),
DEVICE_FLG(0x0c45, 0x6340, /* Sonix HD USB Camera */
QUIRK_FLAG_GET_SAMPLE_RATE),
+ DEVICE_FLG(0x0c45, 0x636b, /* Microdia JP001 USB Camera */
+ QUIRK_FLAG_GET_SAMPLE_RATE),
DEVICE_FLG(0x0d8c, 0x0014, /* USB Audio Device */
QUIRK_FLAG_CTL_MSG_DELAY_1M),
DEVICE_FLG(0x0ecb, 0x205c, /* JBL Quantum610 Wireless */
@@ -2250,6 +2252,8 @@ static const struct usb_audio_quirk_flags_table quirk_flags_table[] = {
QUIRK_FLAG_FIXED_RATE),
DEVICE_FLG(0x0fd9, 0x0008, /* Hauppauge HVR-950Q */
QUIRK_FLAG_SHARE_MEDIA_DEVICE | QUIRK_FLAG_ALIGN_TRANSFER),
+ DEVICE_FLG(0x1101, 0x0003, /* Audioengine D1 */
+ QUIRK_FLAG_GET_SAMPLE_RATE),
DEVICE_FLG(0x1224, 0x2a25, /* Jieli Technology USB PHY 2.0 */
QUIRK_FLAG_GET_SAMPLE_RATE | QUIRK_FLAG_MIC_RES_16),
DEVICE_FLG(0x1395, 0x740a, /* Sennheiser DECT */
diff --git a/tools/include/uapi/linux/fanotify.h b/tools/include/uapi/linux/fanotify.h
new file mode 100644
index 000000000000..e710967c7c26
--- /dev/null
+++ b/tools/include/uapi/linux/fanotify.h
@@ -0,0 +1,274 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_LINUX_FANOTIFY_H
+#define _UAPI_LINUX_FANOTIFY_H
+
+#include <linux/types.h>
+
+/* the following events that user-space can register for */
+#define FAN_ACCESS 0x00000001 /* File was accessed */
+#define FAN_MODIFY 0x00000002 /* File was modified */
+#define FAN_ATTRIB 0x00000004 /* Metadata changed */
+#define FAN_CLOSE_WRITE 0x00000008 /* Writable file closed */
+#define FAN_CLOSE_NOWRITE 0x00000010 /* Unwritable file closed */
+#define FAN_OPEN 0x00000020 /* File was opened */
+#define FAN_MOVED_FROM 0x00000040 /* File was moved from X */
+#define FAN_MOVED_TO 0x00000080 /* File was moved to Y */
+#define FAN_CREATE 0x00000100 /* Subfile was created */
+#define FAN_DELETE 0x00000200 /* Subfile was deleted */
+#define FAN_DELETE_SELF 0x00000400 /* Self was deleted */
+#define FAN_MOVE_SELF 0x00000800 /* Self was moved */
+#define FAN_OPEN_EXEC 0x00001000 /* File was opened for exec */
+
+#define FAN_Q_OVERFLOW 0x00004000 /* Event queued overflowed */
+#define FAN_FS_ERROR 0x00008000 /* Filesystem error */
+
+#define FAN_OPEN_PERM 0x00010000 /* File open in perm check */
+#define FAN_ACCESS_PERM 0x00020000 /* File accessed in perm check */
+#define FAN_OPEN_EXEC_PERM 0x00040000 /* File open/exec in perm check */
+/* #define FAN_DIR_MODIFY 0x00080000 */ /* Deprecated (reserved) */
+
+#define FAN_PRE_ACCESS 0x00100000 /* Pre-content access hook */
+#define FAN_MNT_ATTACH 0x01000000 /* Mount was attached */
+#define FAN_MNT_DETACH 0x02000000 /* Mount was detached */
+
+#define FAN_EVENT_ON_CHILD 0x08000000 /* Interested in child events */
+
+#define FAN_RENAME 0x10000000 /* File was renamed */
+
+#define FAN_ONDIR 0x40000000 /* Event occurred against dir */
+
+/* helper events */
+#define FAN_CLOSE (FAN_CLOSE_WRITE | FAN_CLOSE_NOWRITE) /* close */
+#define FAN_MOVE (FAN_MOVED_FROM | FAN_MOVED_TO) /* moves */
+
+/* flags used for fanotify_init() */
+#define FAN_CLOEXEC 0x00000001
+#define FAN_NONBLOCK 0x00000002
+
+/* These are NOT bitwise flags. Both bits are used together. */
+#define FAN_CLASS_NOTIF 0x00000000
+#define FAN_CLASS_CONTENT 0x00000004
+#define FAN_CLASS_PRE_CONTENT 0x00000008
+
+/* Deprecated - do not use this in programs and do not add new flags here! */
+#define FAN_ALL_CLASS_BITS (FAN_CLASS_NOTIF | FAN_CLASS_CONTENT | \
+ FAN_CLASS_PRE_CONTENT)
+
+#define FAN_UNLIMITED_QUEUE 0x00000010
+#define FAN_UNLIMITED_MARKS 0x00000020
+#define FAN_ENABLE_AUDIT 0x00000040
+
+/* Flags to determine fanotify event format */
+#define FAN_REPORT_PIDFD 0x00000080 /* Report pidfd for event->pid */
+#define FAN_REPORT_TID 0x00000100 /* event->pid is thread id */
+#define FAN_REPORT_FID 0x00000200 /* Report unique file id */
+#define FAN_REPORT_DIR_FID 0x00000400 /* Report unique directory id */
+#define FAN_REPORT_NAME 0x00000800 /* Report events with name */
+#define FAN_REPORT_TARGET_FID 0x00001000 /* Report dirent target id */
+#define FAN_REPORT_FD_ERROR 0x00002000 /* event->fd can report error */
+#define FAN_REPORT_MNT 0x00004000 /* Report mount events */
+
+/* Convenience macro - FAN_REPORT_NAME requires FAN_REPORT_DIR_FID */
+#define FAN_REPORT_DFID_NAME (FAN_REPORT_DIR_FID | FAN_REPORT_NAME)
+/* Convenience macro - FAN_REPORT_TARGET_FID requires all other FID flags */
+#define FAN_REPORT_DFID_NAME_TARGET (FAN_REPORT_DFID_NAME | \
+ FAN_REPORT_FID | FAN_REPORT_TARGET_FID)
+
+/* Deprecated - do not use this in programs and do not add new flags here! */
+#define FAN_ALL_INIT_FLAGS (FAN_CLOEXEC | FAN_NONBLOCK | \
+ FAN_ALL_CLASS_BITS | FAN_UNLIMITED_QUEUE |\
+ FAN_UNLIMITED_MARKS)
+
+/* flags used for fanotify_modify_mark() */
+#define FAN_MARK_ADD 0x00000001
+#define FAN_MARK_REMOVE 0x00000002
+#define FAN_MARK_DONT_FOLLOW 0x00000004
+#define FAN_MARK_ONLYDIR 0x00000008
+/* FAN_MARK_MOUNT is 0x00000010 */
+#define FAN_MARK_IGNORED_MASK 0x00000020
+#define FAN_MARK_IGNORED_SURV_MODIFY 0x00000040
+#define FAN_MARK_FLUSH 0x00000080
+/* FAN_MARK_FILESYSTEM is 0x00000100 */
+#define FAN_MARK_EVICTABLE 0x00000200
+/* This bit is mutually exclusive with FAN_MARK_IGNORED_MASK bit */
+#define FAN_MARK_IGNORE 0x00000400
+
+/* These are NOT bitwise flags. Both bits can be used togther. */
+#define FAN_MARK_INODE 0x00000000
+#define FAN_MARK_MOUNT 0x00000010
+#define FAN_MARK_FILESYSTEM 0x00000100
+#define FAN_MARK_MNTNS 0x00000110
+
+/*
+ * Convenience macro - FAN_MARK_IGNORE requires FAN_MARK_IGNORED_SURV_MODIFY
+ * for non-inode mark types.
+ */
+#define FAN_MARK_IGNORE_SURV (FAN_MARK_IGNORE | FAN_MARK_IGNORED_SURV_MODIFY)
+
+/* Deprecated - do not use this in programs and do not add new flags here! */
+#define FAN_ALL_MARK_FLAGS (FAN_MARK_ADD |\
+ FAN_MARK_REMOVE |\
+ FAN_MARK_DONT_FOLLOW |\
+ FAN_MARK_ONLYDIR |\
+ FAN_MARK_MOUNT |\
+ FAN_MARK_IGNORED_MASK |\
+ FAN_MARK_IGNORED_SURV_MODIFY |\
+ FAN_MARK_FLUSH)
+
+/* Deprecated - do not use this in programs and do not add new flags here! */
+#define FAN_ALL_EVENTS (FAN_ACCESS |\
+ FAN_MODIFY |\
+ FAN_CLOSE |\
+ FAN_OPEN)
+
+/*
+ * All events which require a permission response from userspace
+ */
+/* Deprecated - do not use this in programs and do not add new flags here! */
+#define FAN_ALL_PERM_EVENTS (FAN_OPEN_PERM |\
+ FAN_ACCESS_PERM)
+
+/* Deprecated - do not use this in programs and do not add new flags here! */
+#define FAN_ALL_OUTGOING_EVENTS (FAN_ALL_EVENTS |\
+ FAN_ALL_PERM_EVENTS |\
+ FAN_Q_OVERFLOW)
+
+#define FANOTIFY_METADATA_VERSION 3
+
+struct fanotify_event_metadata {
+ __u32 event_len;
+ __u8 vers;
+ __u8 reserved;
+ __u16 metadata_len;
+ __aligned_u64 mask;
+ __s32 fd;
+ __s32 pid;
+};
+
+#define FAN_EVENT_INFO_TYPE_FID 1
+#define FAN_EVENT_INFO_TYPE_DFID_NAME 2
+#define FAN_EVENT_INFO_TYPE_DFID 3
+#define FAN_EVENT_INFO_TYPE_PIDFD 4
+#define FAN_EVENT_INFO_TYPE_ERROR 5
+#define FAN_EVENT_INFO_TYPE_RANGE 6
+#define FAN_EVENT_INFO_TYPE_MNT 7
+
+/* Special info types for FAN_RENAME */
+#define FAN_EVENT_INFO_TYPE_OLD_DFID_NAME 10
+/* Reserved for FAN_EVENT_INFO_TYPE_OLD_DFID 11 */
+#define FAN_EVENT_INFO_TYPE_NEW_DFID_NAME 12
+/* Reserved for FAN_EVENT_INFO_TYPE_NEW_DFID 13 */
+
+/* Variable length info record following event metadata */
+struct fanotify_event_info_header {
+ __u8 info_type;
+ __u8 pad;
+ __u16 len;
+};
+
+/*
+ * Unique file identifier info record.
+ * This structure is used for records of types FAN_EVENT_INFO_TYPE_FID,
+ * FAN_EVENT_INFO_TYPE_DFID and FAN_EVENT_INFO_TYPE_DFID_NAME.
+ * For FAN_EVENT_INFO_TYPE_DFID_NAME there is additionally a null terminated
+ * name immediately after the file handle.
+ */
+struct fanotify_event_info_fid {
+ struct fanotify_event_info_header hdr;
+ __kernel_fsid_t fsid;
+ /*
+ * Following is an opaque struct file_handle that can be passed as
+ * an argument to open_by_handle_at(2).
+ */
+ unsigned char handle[];
+};
+
+/*
+ * This structure is used for info records of type FAN_EVENT_INFO_TYPE_PIDFD.
+ * It holds a pidfd for the pid that was responsible for generating an event.
+ */
+struct fanotify_event_info_pidfd {
+ struct fanotify_event_info_header hdr;
+ __s32 pidfd;
+};
+
+struct fanotify_event_info_error {
+ struct fanotify_event_info_header hdr;
+ __s32 error;
+ __u32 error_count;
+};
+
+struct fanotify_event_info_range {
+ struct fanotify_event_info_header hdr;
+ __u32 pad;
+ __u64 offset;
+ __u64 count;
+};
+
+struct fanotify_event_info_mnt {
+ struct fanotify_event_info_header hdr;
+ __u64 mnt_id;
+};
+
+/*
+ * User space may need to record additional information about its decision.
+ * The extra information type records what kind of information is included.
+ * The default is none. We also define an extra information buffer whose
+ * size is determined by the extra information type.
+ *
+ * If the information type is Audit Rule, then the information following
+ * is the rule number that triggered the user space decision that
+ * requires auditing.
+ */
+
+#define FAN_RESPONSE_INFO_NONE 0
+#define FAN_RESPONSE_INFO_AUDIT_RULE 1
+
+struct fanotify_response {
+ __s32 fd;
+ __u32 response;
+};
+
+struct fanotify_response_info_header {
+ __u8 type;
+ __u8 pad;
+ __u16 len;
+};
+
+struct fanotify_response_info_audit_rule {
+ struct fanotify_response_info_header hdr;
+ __u32 rule_number;
+ __u32 subj_trust;
+ __u32 obj_trust;
+};
+
+/* Legit userspace responses to a _PERM event */
+#define FAN_ALLOW 0x01
+#define FAN_DENY 0x02
+/* errno other than EPERM can specified in upper byte of deny response */
+#define FAN_ERRNO_BITS 8
+#define FAN_ERRNO_SHIFT (32 - FAN_ERRNO_BITS)
+#define FAN_ERRNO_MASK ((1 << FAN_ERRNO_BITS) - 1)
+#define FAN_DENY_ERRNO(err) \
+ (FAN_DENY | ((((__u32)(err)) & FAN_ERRNO_MASK) << FAN_ERRNO_SHIFT))
+
+#define FAN_AUDIT 0x10 /* Bitmask to create audit record for result */
+#define FAN_INFO 0x20 /* Bitmask to indicate additional information */
+
+/* No fd set in event */
+#define FAN_NOFD -1
+#define FAN_NOPIDFD FAN_NOFD
+#define FAN_EPIDFD -2
+
+/* Helper functions to deal with fanotify_event_metadata buffers */
+#define FAN_EVENT_METADATA_LEN (sizeof(struct fanotify_event_metadata))
+
+#define FAN_EVENT_NEXT(meta, len) ((len) -= (meta)->event_len, \
+ (struct fanotify_event_metadata*)(((char *)(meta)) + \
+ (meta)->event_len))
+
+#define FAN_EVENT_OK(meta, len) ((long)(len) >= (long)FAN_EVENT_METADATA_LEN && \
+ (long)(meta)->event_len >= (long)FAN_EVENT_METADATA_LEN && \
+ (long)(meta)->event_len <= (long)(len))
+
+#endif /* _UAPI_LINUX_FANOTIFY_H */
diff --git a/tools/include/uapi/linux/mount.h b/tools/include/uapi/linux/mount.h
new file mode 100644
index 000000000000..7fa67c2031a5
--- /dev/null
+++ b/tools/include/uapi/linux/mount.h
@@ -0,0 +1,235 @@
+#ifndef _UAPI_LINUX_MOUNT_H
+#define _UAPI_LINUX_MOUNT_H
+
+#include <linux/types.h>
+
+/*
+ * These are the fs-independent mount-flags: up to 32 flags are supported
+ *
+ * Usage of these is restricted within the kernel to core mount(2) code and
+ * callers of sys_mount() only. Filesystems should be using the SB_*
+ * equivalent instead.
+ */
+#define MS_RDONLY 1 /* Mount read-only */
+#define MS_NOSUID 2 /* Ignore suid and sgid bits */
+#define MS_NODEV 4 /* Disallow access to device special files */
+#define MS_NOEXEC 8 /* Disallow program execution */
+#define MS_SYNCHRONOUS 16 /* Writes are synced at once */
+#define MS_REMOUNT 32 /* Alter flags of a mounted FS */
+#define MS_MANDLOCK 64 /* Allow mandatory locks on an FS */
+#define MS_DIRSYNC 128 /* Directory modifications are synchronous */
+#define MS_NOSYMFOLLOW 256 /* Do not follow symlinks */
+#define MS_NOATIME 1024 /* Do not update access times. */
+#define MS_NODIRATIME 2048 /* Do not update directory access times */
+#define MS_BIND 4096
+#define MS_MOVE 8192
+#define MS_REC 16384
+#define MS_VERBOSE 32768 /* War is peace. Verbosity is silence.
+ MS_VERBOSE is deprecated. */
+#define MS_SILENT 32768
+#define MS_POSIXACL (1<<16) /* VFS does not apply the umask */
+#define MS_UNBINDABLE (1<<17) /* change to unbindable */
+#define MS_PRIVATE (1<<18) /* change to private */
+#define MS_SLAVE (1<<19) /* change to slave */
+#define MS_SHARED (1<<20) /* change to shared */
+#define MS_RELATIME (1<<21) /* Update atime relative to mtime/ctime. */
+#define MS_KERNMOUNT (1<<22) /* this is a kern_mount call */
+#define MS_I_VERSION (1<<23) /* Update inode I_version field */
+#define MS_STRICTATIME (1<<24) /* Always perform atime updates */
+#define MS_LAZYTIME (1<<25) /* Update the on-disk [acm]times lazily */
+
+/* These sb flags are internal to the kernel */
+#define MS_SUBMOUNT (1<<26)
+#define MS_NOREMOTELOCK (1<<27)
+#define MS_NOSEC (1<<28)
+#define MS_BORN (1<<29)
+#define MS_ACTIVE (1<<30)
+#define MS_NOUSER (1<<31)
+
+/*
+ * Superblock flags that can be altered by MS_REMOUNT
+ */
+#define MS_RMT_MASK (MS_RDONLY|MS_SYNCHRONOUS|MS_MANDLOCK|MS_I_VERSION|\
+ MS_LAZYTIME)
+
+/*
+ * Old magic mount flag and mask
+ */
+#define MS_MGC_VAL 0xC0ED0000
+#define MS_MGC_MSK 0xffff0000
+
+/*
+ * open_tree() flags.
+ */
+#define OPEN_TREE_CLONE 1 /* Clone the target tree and attach the clone */
+#define OPEN_TREE_CLOEXEC O_CLOEXEC /* Close the file on execve() */
+
+/*
+ * move_mount() flags.
+ */
+#define MOVE_MOUNT_F_SYMLINKS 0x00000001 /* Follow symlinks on from path */
+#define MOVE_MOUNT_F_AUTOMOUNTS 0x00000002 /* Follow automounts on from path */
+#define MOVE_MOUNT_F_EMPTY_PATH 0x00000004 /* Empty from path permitted */
+#define MOVE_MOUNT_T_SYMLINKS 0x00000010 /* Follow symlinks on to path */
+#define MOVE_MOUNT_T_AUTOMOUNTS 0x00000020 /* Follow automounts on to path */
+#define MOVE_MOUNT_T_EMPTY_PATH 0x00000040 /* Empty to path permitted */
+#define MOVE_MOUNT_SET_GROUP 0x00000100 /* Set sharing group instead */
+#define MOVE_MOUNT_BENEATH 0x00000200 /* Mount beneath top mount */
+#define MOVE_MOUNT__MASK 0x00000377
+
+/*
+ * fsopen() flags.
+ */
+#define FSOPEN_CLOEXEC 0x00000001
+
+/*
+ * fspick() flags.
+ */
+#define FSPICK_CLOEXEC 0x00000001
+#define FSPICK_SYMLINK_NOFOLLOW 0x00000002
+#define FSPICK_NO_AUTOMOUNT 0x00000004
+#define FSPICK_EMPTY_PATH 0x00000008
+
+/*
+ * The type of fsconfig() call made.
+ */
+enum fsconfig_command {
+ FSCONFIG_SET_FLAG = 0, /* Set parameter, supplying no value */
+ FSCONFIG_SET_STRING = 1, /* Set parameter, supplying a string value */
+ FSCONFIG_SET_BINARY = 2, /* Set parameter, supplying a binary blob value */
+ FSCONFIG_SET_PATH = 3, /* Set parameter, supplying an object by path */
+ FSCONFIG_SET_PATH_EMPTY = 4, /* Set parameter, supplying an object by (empty) path */
+ FSCONFIG_SET_FD = 5, /* Set parameter, supplying an object by fd */
+ FSCONFIG_CMD_CREATE = 6, /* Create new or reuse existing superblock */
+ FSCONFIG_CMD_RECONFIGURE = 7, /* Invoke superblock reconfiguration */
+ FSCONFIG_CMD_CREATE_EXCL = 8, /* Create new superblock, fail if reusing existing superblock */
+};
+
+/*
+ * fsmount() flags.
+ */
+#define FSMOUNT_CLOEXEC 0x00000001
+
+/*
+ * Mount attributes.
+ */
+#define MOUNT_ATTR_RDONLY 0x00000001 /* Mount read-only */
+#define MOUNT_ATTR_NOSUID 0x00000002 /* Ignore suid and sgid bits */
+#define MOUNT_ATTR_NODEV 0x00000004 /* Disallow access to device special files */
+#define MOUNT_ATTR_NOEXEC 0x00000008 /* Disallow program execution */
+#define MOUNT_ATTR__ATIME 0x00000070 /* Setting on how atime should be updated */
+#define MOUNT_ATTR_RELATIME 0x00000000 /* - Update atime relative to mtime/ctime. */
+#define MOUNT_ATTR_NOATIME 0x00000010 /* - Do not update access times. */
+#define MOUNT_ATTR_STRICTATIME 0x00000020 /* - Always perform atime updates */
+#define MOUNT_ATTR_NODIRATIME 0x00000080 /* Do not update directory access times */
+#define MOUNT_ATTR_IDMAP 0x00100000 /* Idmap mount to @userns_fd in struct mount_attr. */
+#define MOUNT_ATTR_NOSYMFOLLOW 0x00200000 /* Do not follow symlinks */
+
+/*
+ * mount_setattr()
+ */
+struct mount_attr {
+ __u64 attr_set;
+ __u64 attr_clr;
+ __u64 propagation;
+ __u64 userns_fd;
+};
+
+/* List of all mount_attr versions. */
+#define MOUNT_ATTR_SIZE_VER0 32 /* sizeof first published struct */
+
+
+/*
+ * Structure for getting mount/superblock/filesystem info with statmount(2).
+ *
+ * The interface is similar to statx(2): individual fields or groups can be
+ * selected with the @mask argument of statmount(). Kernel will set the @mask
+ * field according to the supported fields.
+ *
+ * If string fields are selected, then the caller needs to pass a buffer that
+ * has space after the fixed part of the structure. Nul terminated strings are
+ * copied there and offsets relative to @str are stored in the relevant fields.
+ * If the buffer is too small, then EOVERFLOW is returned. The actually used
+ * size is returned in @size.
+ */
+struct statmount {
+ __u32 size; /* Total size, including strings */
+ __u32 mnt_opts; /* [str] Options (comma separated, escaped) */
+ __u64 mask; /* What results were written */
+ __u32 sb_dev_major; /* Device ID */
+ __u32 sb_dev_minor;
+ __u64 sb_magic; /* ..._SUPER_MAGIC */
+ __u32 sb_flags; /* SB_{RDONLY,SYNCHRONOUS,DIRSYNC,LAZYTIME} */
+ __u32 fs_type; /* [str] Filesystem type */
+ __u64 mnt_id; /* Unique ID of mount */
+ __u64 mnt_parent_id; /* Unique ID of parent (for root == mnt_id) */
+ __u32 mnt_id_old; /* Reused IDs used in proc/.../mountinfo */
+ __u32 mnt_parent_id_old;
+ __u64 mnt_attr; /* MOUNT_ATTR_... */
+ __u64 mnt_propagation; /* MS_{SHARED,SLAVE,PRIVATE,UNBINDABLE} */
+ __u64 mnt_peer_group; /* ID of shared peer group */
+ __u64 mnt_master; /* Mount receives propagation from this ID */
+ __u64 propagate_from; /* Propagation from in current namespace */
+ __u32 mnt_root; /* [str] Root of mount relative to root of fs */
+ __u32 mnt_point; /* [str] Mountpoint relative to current root */
+ __u64 mnt_ns_id; /* ID of the mount namespace */
+ __u32 fs_subtype; /* [str] Subtype of fs_type (if any) */
+ __u32 sb_source; /* [str] Source string of the mount */
+ __u32 opt_num; /* Number of fs options */
+ __u32 opt_array; /* [str] Array of nul terminated fs options */
+ __u32 opt_sec_num; /* Number of security options */
+ __u32 opt_sec_array; /* [str] Array of nul terminated security options */
+ __u64 supported_mask; /* Mask flags that this kernel supports */
+ __u32 mnt_uidmap_num; /* Number of uid mappings */
+ __u32 mnt_uidmap; /* [str] Array of uid mappings (as seen from callers namespace) */
+ __u32 mnt_gidmap_num; /* Number of gid mappings */
+ __u32 mnt_gidmap; /* [str] Array of gid mappings (as seen from callers namespace) */
+ __u64 __spare2[43];
+ char str[]; /* Variable size part containing strings */
+};
+
+/*
+ * Structure for passing mount ID and miscellaneous parameters to statmount(2)
+ * and listmount(2).
+ *
+ * For statmount(2) @param represents the request mask.
+ * For listmount(2) @param represents the last listed mount id (or zero).
+ */
+struct mnt_id_req {
+ __u32 size;
+ __u32 spare;
+ __u64 mnt_id;
+ __u64 param;
+ __u64 mnt_ns_id;
+};
+
+/* List of all mnt_id_req versions. */
+#define MNT_ID_REQ_SIZE_VER0 24 /* sizeof first published struct */
+#define MNT_ID_REQ_SIZE_VER1 32 /* sizeof second published struct */
+
+/*
+ * @mask bits for statmount(2)
+ */
+#define STATMOUNT_SB_BASIC 0x00000001U /* Want/got sb_... */
+#define STATMOUNT_MNT_BASIC 0x00000002U /* Want/got mnt_... */
+#define STATMOUNT_PROPAGATE_FROM 0x00000004U /* Want/got propagate_from */
+#define STATMOUNT_MNT_ROOT 0x00000008U /* Want/got mnt_root */
+#define STATMOUNT_MNT_POINT 0x00000010U /* Want/got mnt_point */
+#define STATMOUNT_FS_TYPE 0x00000020U /* Want/got fs_type */
+#define STATMOUNT_MNT_NS_ID 0x00000040U /* Want/got mnt_ns_id */
+#define STATMOUNT_MNT_OPTS 0x00000080U /* Want/got mnt_opts */
+#define STATMOUNT_FS_SUBTYPE 0x00000100U /* Want/got fs_subtype */
+#define STATMOUNT_SB_SOURCE 0x00000200U /* Want/got sb_source */
+#define STATMOUNT_OPT_ARRAY 0x00000400U /* Want/got opt_... */
+#define STATMOUNT_OPT_SEC_ARRAY 0x00000800U /* Want/got opt_sec... */
+#define STATMOUNT_SUPPORTED_MASK 0x00001000U /* Want/got supported mask flags */
+#define STATMOUNT_MNT_UIDMAP 0x00002000U /* Want/got uidmap... */
+#define STATMOUNT_MNT_GIDMAP 0x00004000U /* Want/got gidmap... */
+
+/*
+ * Special @mnt_id values that can be passed to listmount
+ */
+#define LSMT_ROOT 0xffffffffffffffff /* root mount */
+#define LISTMOUNT_REVERSE (1 << 0) /* List later mounts first */
+
+#endif /* _UAPI_LINUX_MOUNT_H */
diff --git a/tools/include/uapi/linux/nsfs.h b/tools/include/uapi/linux/nsfs.h
new file mode 100644
index 000000000000..34127653fd00
--- /dev/null
+++ b/tools/include/uapi/linux/nsfs.h
@@ -0,0 +1,45 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef __LINUX_NSFS_H
+#define __LINUX_NSFS_H
+
+#include <linux/ioctl.h>
+#include <linux/types.h>
+
+#define NSIO 0xb7
+
+/* Returns a file descriptor that refers to an owning user namespace */
+#define NS_GET_USERNS _IO(NSIO, 0x1)
+/* Returns a file descriptor that refers to a parent namespace */
+#define NS_GET_PARENT _IO(NSIO, 0x2)
+/* Returns the type of namespace (CLONE_NEW* value) referred to by
+ file descriptor */
+#define NS_GET_NSTYPE _IO(NSIO, 0x3)
+/* Get owner UID (in the caller's user namespace) for a user namespace */
+#define NS_GET_OWNER_UID _IO(NSIO, 0x4)
+/* Get the id for a mount namespace */
+#define NS_GET_MNTNS_ID _IOR(NSIO, 0x5, __u64)
+/* Translate pid from target pid namespace into the caller's pid namespace. */
+#define NS_GET_PID_FROM_PIDNS _IOR(NSIO, 0x6, int)
+/* Return thread-group leader id of pid in the callers pid namespace. */
+#define NS_GET_TGID_FROM_PIDNS _IOR(NSIO, 0x7, int)
+/* Translate pid from caller's pid namespace into a target pid namespace. */
+#define NS_GET_PID_IN_PIDNS _IOR(NSIO, 0x8, int)
+/* Return thread-group leader id of pid in the target pid namespace. */
+#define NS_GET_TGID_IN_PIDNS _IOR(NSIO, 0x9, int)
+
+struct mnt_ns_info {
+ __u32 size;
+ __u32 nr_mounts;
+ __u64 mnt_ns_id;
+};
+
+#define MNT_NS_INFO_SIZE_VER0 16 /* size of first published struct */
+
+/* Get information about namespace. */
+#define NS_MNT_GET_INFO _IOR(NSIO, 10, struct mnt_ns_info)
+/* Get next namespace. */
+#define NS_MNT_GET_NEXT _IOR(NSIO, 11, struct mnt_ns_info)
+/* Get previous namespace. */
+#define NS_MNT_GET_PREV _IOR(NSIO, 12, struct mnt_ns_info)
+
+#endif /* __LINUX_NSFS_H */
diff --git a/tools/net/ynl/pyynl/ethtool.py b/tools/net/ynl/pyynl/ethtool.py
index af7fddd7b085..cab6b576c876 100755
--- a/tools/net/ynl/pyynl/ethtool.py
+++ b/tools/net/ynl/pyynl/ethtool.py
@@ -338,16 +338,24 @@ def main():
print('Capabilities:')
[print(f'\t{v}') for v in bits_to_dict(tsinfo['timestamping'])]
- print(f'PTP Hardware Clock: {tsinfo["phc-index"]}')
+ print(f'PTP Hardware Clock: {tsinfo.get("phc-index", "none")}')
- print('Hardware Transmit Timestamp Modes:')
- [print(f'\t{v}') for v in bits_to_dict(tsinfo['tx-types'])]
+ if 'tx-types' in tsinfo:
+ print('Hardware Transmit Timestamp Modes:')
+ [print(f'\t{v}') for v in bits_to_dict(tsinfo['tx-types'])]
+ else:
+ print('Hardware Transmit Timestamp Modes: none')
+
+ if 'rx-filters' in tsinfo:
+ print('Hardware Receive Filter Modes:')
+ [print(f'\t{v}') for v in bits_to_dict(tsinfo['rx-filters'])]
+ else:
+ print('Hardware Receive Filter Modes: none')
- print('Hardware Receive Filter Modes:')
- [print(f'\t{v}') for v in bits_to_dict(tsinfo['rx-filters'])]
+ if 'stats' in tsinfo and tsinfo['stats']:
+ print('Statistics:')
+ [print(f'\t{k}: {v}') for k, v in tsinfo['stats'].items()]
- print('Statistics:')
- [print(f'\t{k}: {v}') for k, v in tsinfo['stats'].items()]
return
print(f'Settings for {args.device}:')
diff --git a/tools/net/ynl/pyynl/ynl_gen_c.py b/tools/net/ynl/pyynl/ynl_gen_c.py
index 30c0a34b2784..a7f08edbc235 100755
--- a/tools/net/ynl/pyynl/ynl_gen_c.py
+++ b/tools/net/ynl/pyynl/ynl_gen_c.py
@@ -1143,10 +1143,9 @@ class Family(SpecFamily):
self.pure_nested_structs[nested].request = True
if attr in rs_members['reply']:
self.pure_nested_structs[nested].reply = True
-
- if spec.is_multi_val():
- child = self.pure_nested_structs.get(nested)
- child.in_multi_val = True
+ if spec.is_multi_val():
+ child = self.pure_nested_structs.get(nested)
+ child.in_multi_val = True
self._sort_pure_types()
diff --git a/tools/testing/selftests/bpf/config.aarch64 b/tools/testing/selftests/bpf/config.aarch64
index 3720b7611523..e1495a4bbc99 100644
--- a/tools/testing/selftests/bpf/config.aarch64
+++ b/tools/testing/selftests/bpf/config.aarch64
@@ -158,7 +158,6 @@ CONFIG_TRANSPARENT_HUGEPAGE=y
CONFIG_TUN=y
CONFIG_UNIX=y
CONFIG_UPROBES=y
-CONFIG_USELIB=y
CONFIG_USER_NS=y
CONFIG_VETH=y
CONFIG_VLAN_8021Q=y
diff --git a/tools/testing/selftests/bpf/config.s390x b/tools/testing/selftests/bpf/config.s390x
index 706931a8c2c6..26c3bc2ce11d 100644
--- a/tools/testing/selftests/bpf/config.s390x
+++ b/tools/testing/selftests/bpf/config.s390x
@@ -128,7 +128,6 @@ CONFIG_TRANSPARENT_HUGEPAGE=y
CONFIG_TUN=y
CONFIG_UNIX=y
CONFIG_UPROBES=y
-CONFIG_USELIB=y
CONFIG_USER_NS=y
CONFIG_VETH=y
CONFIG_VLAN_8021Q=y
diff --git a/tools/testing/selftests/coredump/stackdump_test.c b/tools/testing/selftests/coredump/stackdump_test.c
index 137b2364a082..9984413be9f0 100644
--- a/tools/testing/selftests/coredump/stackdump_test.c
+++ b/tools/testing/selftests/coredump/stackdump_test.c
@@ -1,14 +1,20 @@
// SPDX-License-Identifier: GPL-2.0
#include <fcntl.h>
+#include <inttypes.h>
#include <libgen.h>
#include <linux/limits.h>
#include <pthread.h>
#include <string.h>
+#include <sys/mount.h>
#include <sys/resource.h>
+#include <sys/stat.h>
+#include <sys/socket.h>
+#include <sys/un.h>
#include <unistd.h>
#include "../kselftest_harness.h"
+#include "../pidfd/pidfd.h"
#define STACKDUMP_FILE "stack_values"
#define STACKDUMP_SCRIPT "stackdump"
@@ -35,6 +41,7 @@ static void crashing_child(void)
FIXTURE(coredump)
{
char original_core_pattern[256];
+ pid_t pid_coredump_server;
};
FIXTURE_SETUP(coredump)
@@ -44,6 +51,7 @@ FIXTURE_SETUP(coredump)
char *dir;
int ret;
+ self->pid_coredump_server = -ESRCH;
file = fopen("/proc/sys/kernel/core_pattern", "r");
ASSERT_NE(NULL, file);
@@ -61,10 +69,17 @@ FIXTURE_TEARDOWN(coredump)
{
const char *reason;
FILE *file;
- int ret;
+ int ret, status;
unlink(STACKDUMP_FILE);
+ if (self->pid_coredump_server > 0) {
+ kill(self->pid_coredump_server, SIGTERM);
+ waitpid(self->pid_coredump_server, &status, 0);
+ }
+ unlink("/tmp/coredump.file");
+ unlink("/tmp/coredump.socket");
+
file = fopen("/proc/sys/kernel/core_pattern", "w");
if (!file) {
reason = "Unable to open core_pattern";
@@ -89,14 +104,14 @@ fail:
fprintf(stderr, "Failed to cleanup stackdump test: %s\n", reason);
}
-TEST_F(coredump, stackdump)
+TEST_F_TIMEOUT(coredump, stackdump, 120)
{
struct sigaction action = {};
unsigned long long stack;
char *test_dir, *line;
size_t line_length;
char buf[PATH_MAX];
- int ret, i;
+ int ret, i, status;
FILE *file;
pid_t pid;
@@ -129,6 +144,10 @@ TEST_F(coredump, stackdump)
/*
* Step 3: Wait for the stackdump script to write the stack pointers to the stackdump file
*/
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFSIGNALED(status));
+ ASSERT_TRUE(WCOREDUMP(status));
+
for (i = 0; i < 10; ++i) {
file = fopen(STACKDUMP_FILE, "r");
if (file)
@@ -138,14 +157,466 @@ TEST_F(coredump, stackdump)
ASSERT_NE(file, NULL);
/* Step 4: Make sure all stack pointer values are non-zero */
+ line = NULL;
for (i = 0; -1 != getline(&line, &line_length, file); ++i) {
stack = strtoull(line, NULL, 10);
ASSERT_NE(stack, 0);
}
+ free(line);
ASSERT_EQ(i, 1 + NUM_THREAD_SPAWN);
fclose(file);
}
+TEST_F(coredump, socket)
+{
+ int fd, pidfd, ret, status;
+ FILE *file;
+ pid_t pid, pid_coredump_server;
+ struct stat st;
+ char core_file[PATH_MAX];
+ struct pidfd_info info = {};
+ int ipc_sockets[2];
+ char c;
+ const struct sockaddr_un coredump_sk = {
+ .sun_family = AF_UNIX,
+ .sun_path = "/tmp/coredump.socket",
+ };
+ size_t coredump_sk_len = offsetof(struct sockaddr_un, sun_path) +
+ sizeof("/tmp/coredump.socket");
+
+ ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
+ ASSERT_EQ(ret, 0);
+
+ file = fopen("/proc/sys/kernel/core_pattern", "w");
+ ASSERT_NE(file, NULL);
+
+ ret = fprintf(file, "@/tmp/coredump.socket");
+ ASSERT_EQ(ret, strlen("@/tmp/coredump.socket"));
+ ASSERT_EQ(fclose(file), 0);
+
+ pid_coredump_server = fork();
+ ASSERT_GE(pid_coredump_server, 0);
+ if (pid_coredump_server == 0) {
+ int fd_server, fd_coredump, fd_peer_pidfd, fd_core_file;
+ socklen_t fd_peer_pidfd_len;
+
+ close(ipc_sockets[0]);
+
+ fd_server = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0);
+ if (fd_server < 0)
+ _exit(EXIT_FAILURE);
+
+ ret = bind(fd_server, (const struct sockaddr *)&coredump_sk, coredump_sk_len);
+ if (ret < 0) {
+ fprintf(stderr, "Failed to bind coredump socket\n");
+ close(fd_server);
+ close(ipc_sockets[1]);
+ _exit(EXIT_FAILURE);
+ }
+
+ ret = listen(fd_server, 1);
+ if (ret < 0) {
+ fprintf(stderr, "Failed to listen on coredump socket\n");
+ close(fd_server);
+ close(ipc_sockets[1]);
+ _exit(EXIT_FAILURE);
+ }
+
+ if (write_nointr(ipc_sockets[1], "1", 1) < 0) {
+ close(fd_server);
+ close(ipc_sockets[1]);
+ _exit(EXIT_FAILURE);
+ }
+
+ close(ipc_sockets[1]);
+
+ fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
+ if (fd_coredump < 0) {
+ fprintf(stderr, "Failed to accept coredump socket connection\n");
+ close(fd_server);
+ _exit(EXIT_FAILURE);
+ }
+
+ fd_peer_pidfd_len = sizeof(fd_peer_pidfd);
+ ret = getsockopt(fd_coredump, SOL_SOCKET, SO_PEERPIDFD,
+ &fd_peer_pidfd, &fd_peer_pidfd_len);
+ if (ret < 0) {
+ fprintf(stderr, "%m - Failed to retrieve peer pidfd for coredump socket connection\n");
+ close(fd_coredump);
+ close(fd_server);
+ _exit(EXIT_FAILURE);
+ }
+
+ memset(&info, 0, sizeof(info));
+ info.mask = PIDFD_INFO_EXIT | PIDFD_INFO_COREDUMP;
+ ret = ioctl(fd_peer_pidfd, PIDFD_GET_INFO, &info);
+ if (ret < 0) {
+ fprintf(stderr, "Failed to retrieve pidfd info from peer pidfd for coredump socket connection\n");
+ close(fd_coredump);
+ close(fd_server);
+ close(fd_peer_pidfd);
+ _exit(EXIT_FAILURE);
+ }
+
+ if (!(info.mask & PIDFD_INFO_COREDUMP)) {
+ fprintf(stderr, "Missing coredump information from coredumping task\n");
+ close(fd_coredump);
+ close(fd_server);
+ close(fd_peer_pidfd);
+ _exit(EXIT_FAILURE);
+ }
+
+ if (!(info.coredump_mask & PIDFD_COREDUMPED)) {
+ fprintf(stderr, "Received connection from non-coredumping task\n");
+ close(fd_coredump);
+ close(fd_server);
+ close(fd_peer_pidfd);
+ _exit(EXIT_FAILURE);
+ }
+
+ fd_core_file = creat("/tmp/coredump.file", 0644);
+ if (fd_core_file < 0) {
+ fprintf(stderr, "Failed to create coredump file\n");
+ close(fd_coredump);
+ close(fd_server);
+ close(fd_peer_pidfd);
+ _exit(EXIT_FAILURE);
+ }
+
+ for (;;) {
+ char buffer[4096];
+ ssize_t bytes_read, bytes_write;
+
+ bytes_read = read(fd_coredump, buffer, sizeof(buffer));
+ if (bytes_read < 0) {
+ close(fd_coredump);
+ close(fd_server);
+ close(fd_peer_pidfd);
+ close(fd_core_file);
+ _exit(EXIT_FAILURE);
+ }
+
+ if (bytes_read == 0)
+ break;
+
+ bytes_write = write(fd_core_file, buffer, bytes_read);
+ if (bytes_read != bytes_write) {
+ close(fd_coredump);
+ close(fd_server);
+ close(fd_peer_pidfd);
+ close(fd_core_file);
+ _exit(EXIT_FAILURE);
+ }
+ }
+
+ close(fd_coredump);
+ close(fd_server);
+ close(fd_peer_pidfd);
+ close(fd_core_file);
+ _exit(EXIT_SUCCESS);
+ }
+ self->pid_coredump_server = pid_coredump_server;
+
+ EXPECT_EQ(close(ipc_sockets[1]), 0);
+ ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
+ EXPECT_EQ(close(ipc_sockets[0]), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+ if (pid == 0)
+ crashing_child();
+
+ pidfd = sys_pidfd_open(pid, 0);
+ ASSERT_GE(pidfd, 0);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFSIGNALED(status));
+ ASSERT_TRUE(WCOREDUMP(status));
+
+ info.mask = PIDFD_INFO_EXIT | PIDFD_INFO_COREDUMP;
+ ASSERT_EQ(ioctl(pidfd, PIDFD_GET_INFO, &info), 0);
+ ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0);
+ ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0);
+
+ waitpid(pid_coredump_server, &status, 0);
+ self->pid_coredump_server = -ESRCH;
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ ASSERT_EQ(stat("/tmp/coredump.file", &st), 0);
+ ASSERT_GT(st.st_size, 0);
+ /*
+ * We should somehow validate the produced core file.
+ * For now just allow for visual inspection
+ */
+ system("file /tmp/coredump.file");
+}
+
+TEST_F(coredump, socket_detect_userspace_client)
+{
+ int fd, pidfd, ret, status;
+ FILE *file;
+ pid_t pid, pid_coredump_server;
+ struct stat st;
+ char core_file[PATH_MAX];
+ struct pidfd_info info = {};
+ int ipc_sockets[2];
+ char c;
+ const struct sockaddr_un coredump_sk = {
+ .sun_family = AF_UNIX,
+ .sun_path = "/tmp/coredump.socket",
+ };
+ size_t coredump_sk_len = offsetof(struct sockaddr_un, sun_path) +
+ sizeof("/tmp/coredump.socket");
+
+ file = fopen("/proc/sys/kernel/core_pattern", "w");
+ ASSERT_NE(file, NULL);
+
+ ret = fprintf(file, "@/tmp/coredump.socket");
+ ASSERT_EQ(ret, strlen("@/tmp/coredump.socket"));
+ ASSERT_EQ(fclose(file), 0);
+
+ ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
+ ASSERT_EQ(ret, 0);
+
+ pid_coredump_server = fork();
+ ASSERT_GE(pid_coredump_server, 0);
+ if (pid_coredump_server == 0) {
+ int fd_server, fd_coredump, fd_peer_pidfd, fd_core_file;
+ socklen_t fd_peer_pidfd_len;
+
+ close(ipc_sockets[0]);
+
+ fd_server = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0);
+ if (fd_server < 0)
+ _exit(EXIT_FAILURE);
+
+ ret = bind(fd_server, (const struct sockaddr *)&coredump_sk, coredump_sk_len);
+ if (ret < 0) {
+ fprintf(stderr, "Failed to bind coredump socket\n");
+ close(fd_server);
+ close(ipc_sockets[1]);
+ _exit(EXIT_FAILURE);
+ }
+
+ ret = listen(fd_server, 1);
+ if (ret < 0) {
+ fprintf(stderr, "Failed to listen on coredump socket\n");
+ close(fd_server);
+ close(ipc_sockets[1]);
+ _exit(EXIT_FAILURE);
+ }
+
+ if (write_nointr(ipc_sockets[1], "1", 1) < 0) {
+ close(fd_server);
+ close(ipc_sockets[1]);
+ _exit(EXIT_FAILURE);
+ }
+
+ close(ipc_sockets[1]);
+
+ fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
+ if (fd_coredump < 0) {
+ fprintf(stderr, "Failed to accept coredump socket connection\n");
+ close(fd_server);
+ _exit(EXIT_FAILURE);
+ }
+
+ fd_peer_pidfd_len = sizeof(fd_peer_pidfd);
+ ret = getsockopt(fd_coredump, SOL_SOCKET, SO_PEERPIDFD,
+ &fd_peer_pidfd, &fd_peer_pidfd_len);
+ if (ret < 0) {
+ fprintf(stderr, "%m - Failed to retrieve peer pidfd for coredump socket connection\n");
+ close(fd_coredump);
+ close(fd_server);
+ _exit(EXIT_FAILURE);
+ }
+
+ memset(&info, 0, sizeof(info));
+ info.mask = PIDFD_INFO_EXIT | PIDFD_INFO_COREDUMP;
+ ret = ioctl(fd_peer_pidfd, PIDFD_GET_INFO, &info);
+ if (ret < 0) {
+ fprintf(stderr, "Failed to retrieve pidfd info from peer pidfd for coredump socket connection\n");
+ close(fd_coredump);
+ close(fd_server);
+ close(fd_peer_pidfd);
+ _exit(EXIT_FAILURE);
+ }
+
+ if (!(info.mask & PIDFD_INFO_COREDUMP)) {
+ fprintf(stderr, "Missing coredump information from coredumping task\n");
+ close(fd_coredump);
+ close(fd_server);
+ close(fd_peer_pidfd);
+ _exit(EXIT_FAILURE);
+ }
+
+ if (info.coredump_mask & PIDFD_COREDUMPED) {
+ fprintf(stderr, "Received unexpected connection from coredumping task\n");
+ close(fd_coredump);
+ close(fd_server);
+ close(fd_peer_pidfd);
+ _exit(EXIT_FAILURE);
+ }
+
+ close(fd_coredump);
+ close(fd_server);
+ close(fd_peer_pidfd);
+ close(fd_core_file);
+ _exit(EXIT_SUCCESS);
+ }
+ self->pid_coredump_server = pid_coredump_server;
+
+ EXPECT_EQ(close(ipc_sockets[1]), 0);
+ ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
+ EXPECT_EQ(close(ipc_sockets[0]), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+ if (pid == 0) {
+ int fd_socket;
+ ssize_t ret;
+
+ fd_socket = socket(AF_UNIX, SOCK_STREAM, 0);
+ if (fd_socket < 0)
+ _exit(EXIT_FAILURE);
+
+
+ ret = connect(fd_socket, (const struct sockaddr *)&coredump_sk, coredump_sk_len);
+ if (ret < 0)
+ _exit(EXIT_FAILURE);
+
+ (void *)write(fd_socket, &(char){ 0 }, 1);
+ close(fd_socket);
+ _exit(EXIT_SUCCESS);
+ }
+
+ pidfd = sys_pidfd_open(pid, 0);
+ ASSERT_GE(pidfd, 0);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ info.mask = PIDFD_INFO_EXIT | PIDFD_INFO_COREDUMP;
+ ASSERT_EQ(ioctl(pidfd, PIDFD_GET_INFO, &info), 0);
+ ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0);
+ ASSERT_EQ((info.coredump_mask & PIDFD_COREDUMPED), 0);
+
+ waitpid(pid_coredump_server, &status, 0);
+ self->pid_coredump_server = -ESRCH;
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ ASSERT_NE(stat("/tmp/coredump.file", &st), 0);
+ ASSERT_EQ(errno, ENOENT);
+}
+
+TEST_F(coredump, socket_enoent)
+{
+ int pidfd, ret, status;
+ FILE *file;
+ pid_t pid;
+ char core_file[PATH_MAX];
+
+ file = fopen("/proc/sys/kernel/core_pattern", "w");
+ ASSERT_NE(file, NULL);
+
+ ret = fprintf(file, "@/tmp/coredump.socket");
+ ASSERT_EQ(ret, strlen("@/tmp/coredump.socket"));
+ ASSERT_EQ(fclose(file), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+ if (pid == 0)
+ crashing_child();
+
+ pidfd = sys_pidfd_open(pid, 0);
+ ASSERT_GE(pidfd, 0);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFSIGNALED(status));
+ ASSERT_FALSE(WCOREDUMP(status));
+}
+
+TEST_F(coredump, socket_no_listener)
+{
+ int pidfd, ret, status;
+ FILE *file;
+ pid_t pid, pid_coredump_server;
+ int ipc_sockets[2];
+ char c;
+ const struct sockaddr_un coredump_sk = {
+ .sun_family = AF_UNIX,
+ .sun_path = "/tmp/coredump.socket",
+ };
+ size_t coredump_sk_len = offsetof(struct sockaddr_un, sun_path) +
+ sizeof("/tmp/coredump.socket");
+
+ ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
+ ASSERT_EQ(ret, 0);
+
+ file = fopen("/proc/sys/kernel/core_pattern", "w");
+ ASSERT_NE(file, NULL);
+
+ ret = fprintf(file, "@/tmp/coredump.socket");
+ ASSERT_EQ(ret, strlen("@/tmp/coredump.socket"));
+ ASSERT_EQ(fclose(file), 0);
+
+ pid_coredump_server = fork();
+ ASSERT_GE(pid_coredump_server, 0);
+ if (pid_coredump_server == 0) {
+ int fd_server;
+ socklen_t fd_peer_pidfd_len;
+
+ close(ipc_sockets[0]);
+
+ fd_server = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0);
+ if (fd_server < 0)
+ _exit(EXIT_FAILURE);
+
+ ret = bind(fd_server, (const struct sockaddr *)&coredump_sk, coredump_sk_len);
+ if (ret < 0) {
+ fprintf(stderr, "Failed to bind coredump socket\n");
+ close(fd_server);
+ close(ipc_sockets[1]);
+ _exit(EXIT_FAILURE);
+ }
+
+ if (write_nointr(ipc_sockets[1], "1", 1) < 0) {
+ close(fd_server);
+ close(ipc_sockets[1]);
+ _exit(EXIT_FAILURE);
+ }
+
+ close(fd_server);
+ close(ipc_sockets[1]);
+ _exit(EXIT_SUCCESS);
+ }
+ self->pid_coredump_server = pid_coredump_server;
+
+ EXPECT_EQ(close(ipc_sockets[1]), 0);
+ ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
+ EXPECT_EQ(close(ipc_sockets[0]), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+ if (pid == 0)
+ crashing_child();
+
+ pidfd = sys_pidfd_open(pid, 0);
+ ASSERT_GE(pidfd, 0);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFSIGNALED(status));
+ ASSERT_FALSE(WCOREDUMP(status));
+
+ waitpid(pid_coredump_server, &status, 0);
+ self->pid_coredump_server = -ESRCH;
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+}
+
TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/drivers/net/hw/ncdevmem.c b/tools/testing/selftests/drivers/net/hw/ncdevmem.c
index 2bf14ac2b8c6..9d48004ff1a1 100644
--- a/tools/testing/selftests/drivers/net/hw/ncdevmem.c
+++ b/tools/testing/selftests/drivers/net/hw/ncdevmem.c
@@ -431,6 +431,22 @@ static int parse_address(const char *str, int port, struct sockaddr_in6 *sin6)
return 0;
}
+static struct netdev_queue_id *create_queues(void)
+{
+ struct netdev_queue_id *queues;
+ size_t i = 0;
+
+ queues = calloc(num_queues, sizeof(*queues));
+ for (i = 0; i < num_queues; i++) {
+ queues[i]._present.type = 1;
+ queues[i]._present.id = 1;
+ queues[i].type = NETDEV_QUEUE_TYPE_RX;
+ queues[i].id = start_queue + i;
+ }
+
+ return queues;
+}
+
int do_server(struct memory_buffer *mem)
{
char ctrl_data[sizeof(int) * 20000];
@@ -448,7 +464,6 @@ int do_server(struct memory_buffer *mem)
char buffer[256];
int socket_fd;
int client_fd;
- size_t i = 0;
int ret;
ret = parse_address(server_ip, atoi(port), &server_sin);
@@ -471,16 +486,7 @@ int do_server(struct memory_buffer *mem)
sleep(1);
- queues = malloc(sizeof(*queues) * num_queues);
-
- for (i = 0; i < num_queues; i++) {
- queues[i]._present.type = 1;
- queues[i]._present.id = 1;
- queues[i].type = NETDEV_QUEUE_TYPE_RX;
- queues[i].id = start_queue + i;
- }
-
- if (bind_rx_queue(ifindex, mem->fd, queues, num_queues, &ys))
+ if (bind_rx_queue(ifindex, mem->fd, create_queues(), num_queues, &ys))
error(1, 0, "Failed to bind\n");
tmp_mem = malloc(mem->size);
@@ -545,7 +551,6 @@ int do_server(struct memory_buffer *mem)
goto cleanup;
}
- i++;
for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) {
if (cm->cmsg_level != SOL_SOCKET ||
(cm->cmsg_type != SCM_DEVMEM_DMABUF &&
@@ -630,10 +635,8 @@ cleanup:
void run_devmem_tests(void)
{
- struct netdev_queue_id *queues;
struct memory_buffer *mem;
struct ynl_sock *ys;
- size_t i = 0;
mem = provider->alloc(getpagesize() * NUM_PAGES);
@@ -641,38 +644,24 @@ void run_devmem_tests(void)
if (configure_rss())
error(1, 0, "rss error\n");
- queues = calloc(num_queues, sizeof(*queues));
-
if (configure_headersplit(1))
error(1, 0, "Failed to configure header split\n");
- if (!bind_rx_queue(ifindex, mem->fd, queues, num_queues, &ys))
+ if (!bind_rx_queue(ifindex, mem->fd,
+ calloc(num_queues, sizeof(struct netdev_queue_id)),
+ num_queues, &ys))
error(1, 0, "Binding empty queues array should have failed\n");
- for (i = 0; i < num_queues; i++) {
- queues[i]._present.type = 1;
- queues[i]._present.id = 1;
- queues[i].type = NETDEV_QUEUE_TYPE_RX;
- queues[i].id = start_queue + i;
- }
-
if (configure_headersplit(0))
error(1, 0, "Failed to configure header split\n");
- if (!bind_rx_queue(ifindex, mem->fd, queues, num_queues, &ys))
+ if (!bind_rx_queue(ifindex, mem->fd, create_queues(), num_queues, &ys))
error(1, 0, "Configure dmabuf with header split off should have failed\n");
if (configure_headersplit(1))
error(1, 0, "Failed to configure header split\n");
- for (i = 0; i < num_queues; i++) {
- queues[i]._present.type = 1;
- queues[i]._present.id = 1;
- queues[i].type = NETDEV_QUEUE_TYPE_RX;
- queues[i].id = start_queue + i;
- }
-
- if (bind_rx_queue(ifindex, mem->fd, queues, num_queues, &ys))
+ if (bind_rx_queue(ifindex, mem->fd, create_queues(), num_queues, &ys))
error(1, 0, "Failed to bind\n");
/* Deactivating a bound queue should not be legal */
diff --git a/tools/testing/selftests/filesystems/.gitignore b/tools/testing/selftests/filesystems/.gitignore
index 828b66a10c63..7afa58e2bb20 100644
--- a/tools/testing/selftests/filesystems/.gitignore
+++ b/tools/testing/selftests/filesystems/.gitignore
@@ -2,3 +2,4 @@
dnotify_test
devpts_pts
file_stressor
+anon_inode_test
diff --git a/tools/testing/selftests/filesystems/Makefile b/tools/testing/selftests/filesystems/Makefile
index 66305fc34c60..b02326193fee 100644
--- a/tools/testing/selftests/filesystems/Makefile
+++ b/tools/testing/selftests/filesystems/Makefile
@@ -1,7 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
CFLAGS += $(KHDR_INCLUDES)
-TEST_GEN_PROGS := devpts_pts file_stressor
+TEST_GEN_PROGS := devpts_pts file_stressor anon_inode_test
TEST_GEN_PROGS_EXTENDED := dnotify_test
include ../lib.mk
diff --git a/tools/testing/selftests/filesystems/anon_inode_test.c b/tools/testing/selftests/filesystems/anon_inode_test.c
new file mode 100644
index 000000000000..e8e0ef1460d2
--- /dev/null
+++ b/tools/testing/selftests/filesystems/anon_inode_test.c
@@ -0,0 +1,69 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#define __SANE_USERSPACE_TYPES__
+
+#include <fcntl.h>
+#include <stdio.h>
+#include <sys/stat.h>
+
+#include "../kselftest_harness.h"
+#include "overlayfs/wrappers.h"
+
+TEST(anon_inode_no_chown)
+{
+ int fd_context;
+
+ fd_context = sys_fsopen("tmpfs", 0);
+ ASSERT_GE(fd_context, 0);
+
+ ASSERT_LT(fchown(fd_context, 1234, 5678), 0);
+ ASSERT_EQ(errno, EOPNOTSUPP);
+
+ EXPECT_EQ(close(fd_context), 0);
+}
+
+TEST(anon_inode_no_chmod)
+{
+ int fd_context;
+
+ fd_context = sys_fsopen("tmpfs", 0);
+ ASSERT_GE(fd_context, 0);
+
+ ASSERT_LT(fchmod(fd_context, 0777), 0);
+ ASSERT_EQ(errno, EOPNOTSUPP);
+
+ EXPECT_EQ(close(fd_context), 0);
+}
+
+TEST(anon_inode_no_exec)
+{
+ int fd_context;
+
+ fd_context = sys_fsopen("tmpfs", 0);
+ ASSERT_GE(fd_context, 0);
+
+ ASSERT_LT(execveat(fd_context, "", NULL, NULL, AT_EMPTY_PATH), 0);
+ ASSERT_EQ(errno, EACCES);
+
+ EXPECT_EQ(close(fd_context), 0);
+}
+
+TEST(anon_inode_no_open)
+{
+ int fd_context;
+
+ fd_context = sys_fsopen("tmpfs", 0);
+ ASSERT_GE(fd_context, 0);
+
+ ASSERT_GE(dup2(fd_context, 500), 0);
+ ASSERT_EQ(close(fd_context), 0);
+ fd_context = 500;
+
+ ASSERT_LT(open("/proc/self/fd/500", 0), 0);
+ ASSERT_EQ(errno, ENXIO);
+
+ EXPECT_EQ(close(fd_context), 0);
+}
+
+TEST_HARNESS_MAIN
+
diff --git a/tools/testing/selftests/filesystems/mount-notify/.gitignore b/tools/testing/selftests/filesystems/mount-notify/.gitignore
index 82a4846cbc4b..124339ea7845 100644
--- a/tools/testing/selftests/filesystems/mount-notify/.gitignore
+++ b/tools/testing/selftests/filesystems/mount-notify/.gitignore
@@ -1,2 +1,3 @@
# SPDX-License-Identifier: GPL-2.0-only
/*_test
+/*_test_ns
diff --git a/tools/testing/selftests/filesystems/mount-notify/Makefile b/tools/testing/selftests/filesystems/mount-notify/Makefile
index 10be0227b5ae..836a4eb7be06 100644
--- a/tools/testing/selftests/filesystems/mount-notify/Makefile
+++ b/tools/testing/selftests/filesystems/mount-notify/Makefile
@@ -1,6 +1,11 @@
# SPDX-License-Identifier: GPL-2.0-or-later
-CFLAGS += -Wall -O2 -g $(KHDR_INCLUDES)
-TEST_GEN_PROGS := mount-notify_test
+CFLAGS += -Wall -O2 -g $(KHDR_INCLUDES) $(TOOLS_INCLUDES)
+LDLIBS += -lcap
+
+TEST_GEN_PROGS := mount-notify_test mount-notify_test_ns
include ../../lib.mk
+
+$(OUTPUT)/mount-notify_test: ../utils.c
+$(OUTPUT)/mount-notify_test_ns: ../utils.c
diff --git a/tools/testing/selftests/filesystems/mount-notify/mount-notify_test.c b/tools/testing/selftests/filesystems/mount-notify/mount-notify_test.c
index 59a71f22fb11..63ce708d93ed 100644
--- a/tools/testing/selftests/filesystems/mount-notify/mount-notify_test.c
+++ b/tools/testing/selftests/filesystems/mount-notify/mount-notify_test.c
@@ -8,43 +8,21 @@
#include <string.h>
#include <sys/stat.h>
#include <sys/mount.h>
-#include <linux/fanotify.h>
#include <unistd.h>
-#include <sys/fanotify.h>
#include <sys/syscall.h>
#include "../../kselftest_harness.h"
#include "../statmount/statmount.h"
+#include "../utils.h"
-#ifndef FAN_MNT_ATTACH
-struct fanotify_event_info_mnt {
- struct fanotify_event_info_header hdr;
- __u64 mnt_id;
-};
-#define FAN_MNT_ATTACH 0x01000000 /* Mount was attached */
-#endif
-
-#ifndef FAN_MNT_DETACH
-#define FAN_MNT_DETACH 0x02000000 /* Mount was detached */
-#endif
-
-#ifndef FAN_REPORT_MNT
-#define FAN_REPORT_MNT 0x00004000 /* Report mount events */
+// Needed for linux/fanotify.h
+#ifndef __kernel_fsid_t
+typedef struct {
+ int val[2];
+} __kernel_fsid_t;
#endif
-#ifndef FAN_MARK_MNTNS
-#define FAN_MARK_MNTNS 0x00000110
-#endif
-
-static uint64_t get_mnt_id(struct __test_metadata *const _metadata,
- const char *path)
-{
- struct statx sx;
-
- ASSERT_EQ(statx(AT_FDCWD, path, 0, STATX_MNT_ID_UNIQUE, &sx), 0);
- ASSERT_TRUE(!!(sx.stx_mask & STATX_MNT_ID_UNIQUE));
- return sx.stx_mnt_id;
-}
+#include <sys/fanotify.h>
static const char root_mntpoint_templ[] = "/tmp/mount-notify_test_root.XXXXXX";
@@ -94,7 +72,7 @@ FIXTURE_SETUP(fanotify)
ASSERT_EQ(mkdir("b", 0700), 0);
- self->root_id = get_mnt_id(_metadata, "/");
+ self->root_id = get_unique_mnt_id("/");
ASSERT_NE(self->root_id, 0);
for (i = 0; i < NUM_FAN_FDS; i++) {
diff --git a/tools/testing/selftests/filesystems/mount-notify/mount-notify_test_ns.c b/tools/testing/selftests/filesystems/mount-notify/mount-notify_test_ns.c
new file mode 100644
index 000000000000..090a5ca65004
--- /dev/null
+++ b/tools/testing/selftests/filesystems/mount-notify/mount-notify_test_ns.c
@@ -0,0 +1,557 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+// Copyright (c) 2025 Miklos Szeredi <miklos@szeredi.hu>
+
+#define _GNU_SOURCE
+#include <fcntl.h>
+#include <sched.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/mount.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+
+#include "../../kselftest_harness.h"
+#include "../../pidfd/pidfd.h"
+#include "../statmount/statmount.h"
+#include "../utils.h"
+
+// Needed for linux/fanotify.h
+#ifndef __kernel_fsid_t
+typedef struct {
+ int val[2];
+} __kernel_fsid_t;
+#endif
+
+#include <sys/fanotify.h>
+
+static const char root_mntpoint_templ[] = "/tmp/mount-notify_test_root.XXXXXX";
+
+static const int mark_types[] = {
+ FAN_MARK_FILESYSTEM,
+ FAN_MARK_MOUNT,
+ FAN_MARK_INODE
+};
+
+static const int mark_cmds[] = {
+ FAN_MARK_ADD,
+ FAN_MARK_REMOVE,
+ FAN_MARK_FLUSH
+};
+
+#define NUM_FAN_FDS ARRAY_SIZE(mark_cmds)
+
+FIXTURE(fanotify) {
+ int fan_fd[NUM_FAN_FDS];
+ char buf[256];
+ unsigned int rem;
+ void *next;
+ char root_mntpoint[sizeof(root_mntpoint_templ)];
+ int orig_root;
+ int orig_ns_fd;
+ int ns_fd;
+ uint64_t root_id;
+};
+
+FIXTURE_SETUP(fanotify)
+{
+ int i, ret;
+
+ self->orig_ns_fd = open("/proc/self/ns/mnt", O_RDONLY);
+ ASSERT_GE(self->orig_ns_fd, 0);
+
+ ret = setup_userns();
+ ASSERT_EQ(ret, 0);
+
+ self->ns_fd = open("/proc/self/ns/mnt", O_RDONLY);
+ ASSERT_GE(self->ns_fd, 0);
+
+ strcpy(self->root_mntpoint, root_mntpoint_templ);
+ ASSERT_NE(mkdtemp(self->root_mntpoint), NULL);
+
+ self->orig_root = open("/", O_PATH | O_CLOEXEC);
+ ASSERT_GE(self->orig_root, 0);
+
+ ASSERT_EQ(mount("tmpfs", self->root_mntpoint, "tmpfs", 0, NULL), 0);
+
+ ASSERT_EQ(chroot(self->root_mntpoint), 0);
+
+ ASSERT_EQ(chdir("/"), 0);
+
+ ASSERT_EQ(mkdir("a", 0700), 0);
+
+ ASSERT_EQ(mkdir("b", 0700), 0);
+
+ self->root_id = get_unique_mnt_id("/");
+ ASSERT_NE(self->root_id, 0);
+
+ for (i = 0; i < NUM_FAN_FDS; i++) {
+ int fan_fd = fanotify_init(FAN_REPORT_FID, 0);
+ // Verify that watching tmpfs mounted inside userns is allowed
+ ret = fanotify_mark(fan_fd, FAN_MARK_ADD | mark_types[i],
+ FAN_OPEN, AT_FDCWD, "/");
+ ASSERT_EQ(ret, 0);
+ // ...but watching entire orig root filesystem is not allowed
+ ret = fanotify_mark(fan_fd, FAN_MARK_ADD | FAN_MARK_FILESYSTEM,
+ FAN_OPEN, self->orig_root, ".");
+ ASSERT_NE(ret, 0);
+ close(fan_fd);
+
+ self->fan_fd[i] = fanotify_init(FAN_REPORT_MNT | FAN_NONBLOCK,
+ 0);
+ ASSERT_GE(self->fan_fd[i], 0);
+ // Verify that watching mntns where group was created is allowed
+ ret = fanotify_mark(self->fan_fd[i], FAN_MARK_ADD |
+ FAN_MARK_MNTNS,
+ FAN_MNT_ATTACH | FAN_MNT_DETACH,
+ self->ns_fd, NULL);
+ ASSERT_EQ(ret, 0);
+ // ...but watching orig mntns is not allowed
+ ret = fanotify_mark(self->fan_fd[i], FAN_MARK_ADD |
+ FAN_MARK_MNTNS,
+ FAN_MNT_ATTACH | FAN_MNT_DETACH,
+ self->orig_ns_fd, NULL);
+ ASSERT_NE(ret, 0);
+ // On fd[0] we do an extra ADD that changes nothing.
+ // On fd[1]/fd[2] we REMOVE/FLUSH which removes the mark.
+ ret = fanotify_mark(self->fan_fd[i], mark_cmds[i] |
+ FAN_MARK_MNTNS,
+ FAN_MNT_ATTACH | FAN_MNT_DETACH,
+ self->ns_fd, NULL);
+ ASSERT_EQ(ret, 0);
+ }
+
+ self->rem = 0;
+}
+
+FIXTURE_TEARDOWN(fanotify)
+{
+ int i;
+
+ ASSERT_EQ(self->rem, 0);
+ for (i = 0; i < NUM_FAN_FDS; i++)
+ close(self->fan_fd[i]);
+
+ ASSERT_EQ(fchdir(self->orig_root), 0);
+
+ ASSERT_EQ(chroot("."), 0);
+
+ EXPECT_EQ(umount2(self->root_mntpoint, MNT_DETACH), 0);
+ EXPECT_EQ(chdir(self->root_mntpoint), 0);
+ EXPECT_EQ(chdir("/"), 0);
+ EXPECT_EQ(rmdir(self->root_mntpoint), 0);
+}
+
+static uint64_t expect_notify(struct __test_metadata *const _metadata,
+ FIXTURE_DATA(fanotify) *self,
+ uint64_t *mask)
+{
+ struct fanotify_event_metadata *meta;
+ struct fanotify_event_info_mnt *mnt;
+ unsigned int thislen;
+
+ if (!self->rem) {
+ ssize_t len;
+ int i;
+
+ for (i = NUM_FAN_FDS - 1; i >= 0; i--) {
+ len = read(self->fan_fd[i], self->buf,
+ sizeof(self->buf));
+ if (i > 0) {
+ // Groups 1,2 should get EAGAIN
+ ASSERT_EQ(len, -1);
+ ASSERT_EQ(errno, EAGAIN);
+ } else {
+ // Group 0 should get events
+ ASSERT_GT(len, 0);
+ }
+ }
+
+ self->rem = len;
+ self->next = (void *) self->buf;
+ }
+
+ meta = self->next;
+ ASSERT_TRUE(FAN_EVENT_OK(meta, self->rem));
+
+ thislen = meta->event_len;
+ self->rem -= thislen;
+ self->next += thislen;
+
+ *mask = meta->mask;
+ thislen -= sizeof(*meta);
+
+ mnt = ((void *) meta) + meta->event_len - thislen;
+
+ ASSERT_EQ(thislen, sizeof(*mnt));
+
+ return mnt->mnt_id;
+}
+
+static void expect_notify_n(struct __test_metadata *const _metadata,
+ FIXTURE_DATA(fanotify) *self,
+ unsigned int n, uint64_t mask[], uint64_t mnts[])
+{
+ unsigned int i;
+
+ for (i = 0; i < n; i++)
+ mnts[i] = expect_notify(_metadata, self, &mask[i]);
+}
+
+static uint64_t expect_notify_mask(struct __test_metadata *const _metadata,
+ FIXTURE_DATA(fanotify) *self,
+ uint64_t expect_mask)
+{
+ uint64_t mntid, mask;
+
+ mntid = expect_notify(_metadata, self, &mask);
+ ASSERT_EQ(expect_mask, mask);
+
+ return mntid;
+}
+
+
+static void expect_notify_mask_n(struct __test_metadata *const _metadata,
+ FIXTURE_DATA(fanotify) *self,
+ uint64_t mask, unsigned int n, uint64_t mnts[])
+{
+ unsigned int i;
+
+ for (i = 0; i < n; i++)
+ mnts[i] = expect_notify_mask(_metadata, self, mask);
+}
+
+static void verify_mount_ids(struct __test_metadata *const _metadata,
+ const uint64_t list1[], const uint64_t list2[],
+ size_t num)
+{
+ unsigned int i, j;
+
+ // Check that neither list has any duplicates
+ for (i = 0; i < num; i++) {
+ for (j = 0; j < num; j++) {
+ if (i != j) {
+ ASSERT_NE(list1[i], list1[j]);
+ ASSERT_NE(list2[i], list2[j]);
+ }
+ }
+ }
+ // Check that all list1 memebers can be found in list2. Together with
+ // the above it means that the list1 and list2 represent the same sets.
+ for (i = 0; i < num; i++) {
+ for (j = 0; j < num; j++) {
+ if (list1[i] == list2[j])
+ break;
+ }
+ ASSERT_NE(j, num);
+ }
+}
+
+static void check_mounted(struct __test_metadata *const _metadata,
+ const uint64_t mnts[], size_t num)
+{
+ ssize_t ret;
+ uint64_t *list;
+
+ list = malloc((num + 1) * sizeof(list[0]));
+ ASSERT_NE(list, NULL);
+
+ ret = listmount(LSMT_ROOT, 0, 0, list, num + 1, 0);
+ ASSERT_EQ(ret, num);
+
+ verify_mount_ids(_metadata, mnts, list, num);
+
+ free(list);
+}
+
+static void setup_mount_tree(struct __test_metadata *const _metadata,
+ int log2_num)
+{
+ int ret, i;
+
+ ret = mount("", "/", NULL, MS_SHARED, NULL);
+ ASSERT_EQ(ret, 0);
+
+ for (i = 0; i < log2_num; i++) {
+ ret = mount("/", "/", NULL, MS_BIND, NULL);
+ ASSERT_EQ(ret, 0);
+ }
+}
+
+TEST_F(fanotify, bind)
+{
+ int ret;
+ uint64_t mnts[2] = { self->root_id };
+
+ ret = mount("/", "/", NULL, MS_BIND, NULL);
+ ASSERT_EQ(ret, 0);
+
+ mnts[1] = expect_notify_mask(_metadata, self, FAN_MNT_ATTACH);
+ ASSERT_NE(mnts[0], mnts[1]);
+
+ check_mounted(_metadata, mnts, 2);
+
+ // Cleanup
+ uint64_t detach_id;
+ ret = umount("/");
+ ASSERT_EQ(ret, 0);
+
+ detach_id = expect_notify_mask(_metadata, self, FAN_MNT_DETACH);
+ ASSERT_EQ(detach_id, mnts[1]);
+
+ check_mounted(_metadata, mnts, 1);
+}
+
+TEST_F(fanotify, move)
+{
+ int ret;
+ uint64_t mnts[2] = { self->root_id };
+ uint64_t move_id;
+
+ ret = mount("/", "/a", NULL, MS_BIND, NULL);
+ ASSERT_EQ(ret, 0);
+
+ mnts[1] = expect_notify_mask(_metadata, self, FAN_MNT_ATTACH);
+ ASSERT_NE(mnts[0], mnts[1]);
+
+ check_mounted(_metadata, mnts, 2);
+
+ ret = move_mount(AT_FDCWD, "/a", AT_FDCWD, "/b", 0);
+ ASSERT_EQ(ret, 0);
+
+ move_id = expect_notify_mask(_metadata, self, FAN_MNT_ATTACH | FAN_MNT_DETACH);
+ ASSERT_EQ(move_id, mnts[1]);
+
+ // Cleanup
+ ret = umount("/b");
+ ASSERT_EQ(ret, 0);
+
+ check_mounted(_metadata, mnts, 1);
+}
+
+TEST_F(fanotify, propagate)
+{
+ const unsigned int log2_num = 4;
+ const unsigned int num = (1 << log2_num);
+ uint64_t mnts[num];
+
+ setup_mount_tree(_metadata, log2_num);
+
+ expect_notify_mask_n(_metadata, self, FAN_MNT_ATTACH, num - 1, mnts + 1);
+
+ mnts[0] = self->root_id;
+ check_mounted(_metadata, mnts, num);
+
+ // Cleanup
+ int ret;
+ uint64_t mnts2[num];
+ ret = umount2("/", MNT_DETACH);
+ ASSERT_EQ(ret, 0);
+
+ ret = mount("", "/", NULL, MS_PRIVATE, NULL);
+ ASSERT_EQ(ret, 0);
+
+ mnts2[0] = self->root_id;
+ expect_notify_mask_n(_metadata, self, FAN_MNT_DETACH, num - 1, mnts2 + 1);
+ verify_mount_ids(_metadata, mnts, mnts2, num);
+
+ check_mounted(_metadata, mnts, 1);
+}
+
+TEST_F(fanotify, fsmount)
+{
+ int ret, fs, mnt;
+ uint64_t mnts[2] = { self->root_id };
+
+ fs = fsopen("tmpfs", 0);
+ ASSERT_GE(fs, 0);
+
+ ret = fsconfig(fs, FSCONFIG_CMD_CREATE, 0, 0, 0);
+ ASSERT_EQ(ret, 0);
+
+ mnt = fsmount(fs, 0, 0);
+ ASSERT_GE(mnt, 0);
+
+ close(fs);
+
+ ret = move_mount(mnt, "", AT_FDCWD, "/a", MOVE_MOUNT_F_EMPTY_PATH);
+ ASSERT_EQ(ret, 0);
+
+ close(mnt);
+
+ mnts[1] = expect_notify_mask(_metadata, self, FAN_MNT_ATTACH);
+ ASSERT_NE(mnts[0], mnts[1]);
+
+ check_mounted(_metadata, mnts, 2);
+
+ // Cleanup
+ uint64_t detach_id;
+ ret = umount("/a");
+ ASSERT_EQ(ret, 0);
+
+ detach_id = expect_notify_mask(_metadata, self, FAN_MNT_DETACH);
+ ASSERT_EQ(detach_id, mnts[1]);
+
+ check_mounted(_metadata, mnts, 1);
+}
+
+TEST_F(fanotify, reparent)
+{
+ uint64_t mnts[6] = { self->root_id };
+ uint64_t dmnts[3];
+ uint64_t masks[3];
+ unsigned int i;
+ int ret;
+
+ // Create setup with a[1] -> b[2] propagation
+ ret = mount("/", "/a", NULL, MS_BIND, NULL);
+ ASSERT_EQ(ret, 0);
+
+ ret = mount("", "/a", NULL, MS_SHARED, NULL);
+ ASSERT_EQ(ret, 0);
+
+ ret = mount("/a", "/b", NULL, MS_BIND, NULL);
+ ASSERT_EQ(ret, 0);
+
+ ret = mount("", "/b", NULL, MS_SLAVE, NULL);
+ ASSERT_EQ(ret, 0);
+
+ expect_notify_mask_n(_metadata, self, FAN_MNT_ATTACH, 2, mnts + 1);
+
+ check_mounted(_metadata, mnts, 3);
+
+ // Mount on a[3], which is propagated to b[4]
+ ret = mount("/", "/a", NULL, MS_BIND, NULL);
+ ASSERT_EQ(ret, 0);
+
+ expect_notify_mask_n(_metadata, self, FAN_MNT_ATTACH, 2, mnts + 3);
+
+ check_mounted(_metadata, mnts, 5);
+
+ // Mount on b[5], not propagated
+ ret = mount("/", "/b", NULL, MS_BIND, NULL);
+ ASSERT_EQ(ret, 0);
+
+ mnts[5] = expect_notify_mask(_metadata, self, FAN_MNT_ATTACH);
+
+ check_mounted(_metadata, mnts, 6);
+
+ // Umount a[3], which is propagated to b[4], but not b[5]
+ // This will result in b[5] "falling" on b[2]
+ ret = umount("/a");
+ ASSERT_EQ(ret, 0);
+
+ expect_notify_n(_metadata, self, 3, masks, dmnts);
+ verify_mount_ids(_metadata, mnts + 3, dmnts, 3);
+
+ for (i = 0; i < 3; i++) {
+ if (dmnts[i] == mnts[5]) {
+ ASSERT_EQ(masks[i], FAN_MNT_ATTACH | FAN_MNT_DETACH);
+ } else {
+ ASSERT_EQ(masks[i], FAN_MNT_DETACH);
+ }
+ }
+
+ mnts[3] = mnts[5];
+ check_mounted(_metadata, mnts, 4);
+
+ // Cleanup
+ ret = umount("/b");
+ ASSERT_EQ(ret, 0);
+
+ ret = umount("/a");
+ ASSERT_EQ(ret, 0);
+
+ ret = umount("/b");
+ ASSERT_EQ(ret, 0);
+
+ expect_notify_mask_n(_metadata, self, FAN_MNT_DETACH, 3, dmnts);
+ verify_mount_ids(_metadata, mnts + 1, dmnts, 3);
+
+ check_mounted(_metadata, mnts, 1);
+}
+
+TEST_F(fanotify, rmdir)
+{
+ uint64_t mnts[3] = { self->root_id };
+ int ret;
+
+ ret = mount("/", "/a", NULL, MS_BIND, NULL);
+ ASSERT_EQ(ret, 0);
+
+ ret = mount("/", "/a/b", NULL, MS_BIND, NULL);
+ ASSERT_EQ(ret, 0);
+
+ expect_notify_mask_n(_metadata, self, FAN_MNT_ATTACH, 2, mnts + 1);
+
+ check_mounted(_metadata, mnts, 3);
+
+ ret = chdir("/a");
+ ASSERT_EQ(ret, 0);
+
+ ret = fork();
+ ASSERT_GE(ret, 0);
+
+ if (ret == 0) {
+ chdir("/");
+ unshare(CLONE_NEWNS);
+ mount("", "/", NULL, MS_REC|MS_PRIVATE, NULL);
+ umount2("/a", MNT_DETACH);
+ // This triggers a detach in the other namespace
+ rmdir("/a");
+ exit(0);
+ }
+ wait(NULL);
+
+ expect_notify_mask_n(_metadata, self, FAN_MNT_DETACH, 2, mnts + 1);
+ check_mounted(_metadata, mnts, 1);
+
+ // Cleanup
+ ret = chdir("/");
+ ASSERT_EQ(ret, 0);
+}
+
+TEST_F(fanotify, pivot_root)
+{
+ uint64_t mnts[3] = { self->root_id };
+ uint64_t mnts2[3];
+ int ret;
+
+ ret = mount("tmpfs", "/a", "tmpfs", 0, NULL);
+ ASSERT_EQ(ret, 0);
+
+ mnts[2] = expect_notify_mask(_metadata, self, FAN_MNT_ATTACH);
+
+ ret = mkdir("/a/new", 0700);
+ ASSERT_EQ(ret, 0);
+
+ ret = mkdir("/a/old", 0700);
+ ASSERT_EQ(ret, 0);
+
+ ret = mount("/a", "/a/new", NULL, MS_BIND, NULL);
+ ASSERT_EQ(ret, 0);
+
+ mnts[1] = expect_notify_mask(_metadata, self, FAN_MNT_ATTACH);
+ check_mounted(_metadata, mnts, 3);
+
+ ret = syscall(SYS_pivot_root, "/a/new", "/a/new/old");
+ ASSERT_EQ(ret, 0);
+
+ expect_notify_mask_n(_metadata, self, FAN_MNT_ATTACH | FAN_MNT_DETACH, 2, mnts2);
+ verify_mount_ids(_metadata, mnts, mnts2, 2);
+ check_mounted(_metadata, mnts, 3);
+
+ // Cleanup
+ ret = syscall(SYS_pivot_root, "/old", "/old/a/new");
+ ASSERT_EQ(ret, 0);
+
+ ret = umount("/a/new");
+ ASSERT_EQ(ret, 0);
+
+ ret = umount("/a");
+ ASSERT_EQ(ret, 0);
+
+ check_mounted(_metadata, mnts, 1);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/filesystems/overlayfs/Makefile b/tools/testing/selftests/filesystems/overlayfs/Makefile
index 6c661232b3b5..d3ad4a77db9b 100644
--- a/tools/testing/selftests/filesystems/overlayfs/Makefile
+++ b/tools/testing/selftests/filesystems/overlayfs/Makefile
@@ -4,7 +4,7 @@ CFLAGS += -Wall
CFLAGS += $(KHDR_INCLUDES)
LDLIBS += -lcap
-LOCAL_HDRS += wrappers.h log.h
+LOCAL_HDRS += ../wrappers.h log.h
TEST_GEN_PROGS := dev_in_maps
TEST_GEN_PROGS += set_layers_via_fds
diff --git a/tools/testing/selftests/filesystems/overlayfs/dev_in_maps.c b/tools/testing/selftests/filesystems/overlayfs/dev_in_maps.c
index 3b796264223f..31db54b00e64 100644
--- a/tools/testing/selftests/filesystems/overlayfs/dev_in_maps.c
+++ b/tools/testing/selftests/filesystems/overlayfs/dev_in_maps.c
@@ -17,7 +17,7 @@
#include "../../kselftest.h"
#include "log.h"
-#include "wrappers.h"
+#include "../wrappers.h"
static long get_file_dev_and_inode(void *addr, struct statx *stx)
{
diff --git a/tools/testing/selftests/filesystems/overlayfs/set_layers_via_fds.c b/tools/testing/selftests/filesystems/overlayfs/set_layers_via_fds.c
index 5074e64e74a8..dc0449fa628f 100644
--- a/tools/testing/selftests/filesystems/overlayfs/set_layers_via_fds.c
+++ b/tools/testing/selftests/filesystems/overlayfs/set_layers_via_fds.c
@@ -16,7 +16,7 @@
#include "../../pidfd/pidfd.h"
#include "log.h"
#include "../utils.h"
-#include "wrappers.h"
+#include "../wrappers.h"
FIXTURE(set_layers_via_fds) {
int pidfd;
diff --git a/tools/testing/selftests/filesystems/statmount/Makefile b/tools/testing/selftests/filesystems/statmount/Makefile
index 14ee91a41650..8e354fe99b44 100644
--- a/tools/testing/selftests/filesystems/statmount/Makefile
+++ b/tools/testing/selftests/filesystems/statmount/Makefile
@@ -1,6 +1,10 @@
# SPDX-License-Identifier: GPL-2.0-or-later
-CFLAGS += -Wall -O2 -g $(KHDR_INCLUDES)
+CFLAGS += -Wall -O2 -g $(KHDR_INCLUDES) $(TOOLS_INCLUDES)
+LDLIBS += -lcap
+
TEST_GEN_PROGS := statmount_test statmount_test_ns listmount_test
include ../../lib.mk
+
+$(OUTPUT)/statmount_test_ns: ../utils.c
diff --git a/tools/testing/selftests/filesystems/statmount/statmount.h b/tools/testing/selftests/filesystems/statmount/statmount.h
index a7a5289ddae9..99e5ad082fb1 100644
--- a/tools/testing/selftests/filesystems/statmount/statmount.h
+++ b/tools/testing/selftests/filesystems/statmount/statmount.h
@@ -7,6 +7,42 @@
#include <linux/mount.h>
#include <asm/unistd.h>
+#ifndef __NR_statmount
+ #if defined __alpha__
+ #define __NR_statmount 567
+ #elif defined _MIPS_SIM
+ #if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */
+ #define __NR_statmount 4457
+ #endif
+ #if _MIPS_SIM == _MIPS_SIM_NABI32 /* n32 */
+ #define __NR_statmount 6457
+ #endif
+ #if _MIPS_SIM == _MIPS_SIM_ABI64 /* n64 */
+ #define __NR_statmount 5457
+ #endif
+ #else
+ #define __NR_statmount 457
+ #endif
+#endif
+
+#ifndef __NR_listmount
+ #if defined __alpha__
+ #define __NR_listmount 568
+ #elif defined _MIPS_SIM
+ #if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */
+ #define __NR_listmount 4458
+ #endif
+ #if _MIPS_SIM == _MIPS_SIM_NABI32 /* n32 */
+ #define __NR_listmount 6458
+ #endif
+ #if _MIPS_SIM == _MIPS_SIM_ABI64 /* n64 */
+ #define __NR_listmount 5458
+ #endif
+ #else
+ #define __NR_listmount 458
+ #endif
+#endif
+
static inline int statmount(uint64_t mnt_id, uint64_t mnt_ns_id, uint64_t mask,
struct statmount *buf, size_t bufsize,
unsigned int flags)
diff --git a/tools/testing/selftests/filesystems/statmount/statmount_test_ns.c b/tools/testing/selftests/filesystems/statmount/statmount_test_ns.c
index 70cb0c8b21cf..605a3fa16bf7 100644
--- a/tools/testing/selftests/filesystems/statmount/statmount_test_ns.c
+++ b/tools/testing/selftests/filesystems/statmount/statmount_test_ns.c
@@ -14,6 +14,7 @@
#include <linux/stat.h>
#include "statmount.h"
+#include "../utils.h"
#include "../../kselftest.h"
#define NSID_PASS 0
@@ -78,87 +79,10 @@ static int get_mnt_ns_id(const char *mnt_ns, uint64_t *mnt_ns_id)
return NSID_PASS;
}
-static int get_mnt_id(const char *path, uint64_t *mnt_id)
-{
- struct statx sx;
- int ret;
-
- ret = statx(AT_FDCWD, path, 0, STATX_MNT_ID_UNIQUE, &sx);
- if (ret == -1) {
- ksft_print_msg("retrieving unique mount ID for %s: %s\n", path,
- strerror(errno));
- return NSID_ERROR;
- }
-
- if (!(sx.stx_mask & STATX_MNT_ID_UNIQUE)) {
- ksft_print_msg("no unique mount ID available for %s\n", path);
- return NSID_ERROR;
- }
-
- *mnt_id = sx.stx_mnt_id;
- return NSID_PASS;
-}
-
-static int write_file(const char *path, const char *val)
-{
- int fd = open(path, O_WRONLY);
- size_t len = strlen(val);
- int ret;
-
- if (fd == -1) {
- ksft_print_msg("opening %s for write: %s\n", path, strerror(errno));
- return NSID_ERROR;
- }
-
- ret = write(fd, val, len);
- if (ret == -1) {
- ksft_print_msg("writing to %s: %s\n", path, strerror(errno));
- return NSID_ERROR;
- }
- if (ret != len) {
- ksft_print_msg("short write to %s\n", path);
- return NSID_ERROR;
- }
-
- ret = close(fd);
- if (ret == -1) {
- ksft_print_msg("closing %s\n", path);
- return NSID_ERROR;
- }
-
- return NSID_PASS;
-}
-
static int setup_namespace(void)
{
- int ret;
- char buf[32];
- uid_t uid = getuid();
- gid_t gid = getgid();
-
- ret = unshare(CLONE_NEWNS|CLONE_NEWUSER|CLONE_NEWPID);
- if (ret == -1)
- ksft_exit_fail_msg("unsharing mountns and userns: %s\n",
- strerror(errno));
-
- sprintf(buf, "0 %d 1", uid);
- ret = write_file("/proc/self/uid_map", buf);
- if (ret != NSID_PASS)
- return ret;
- ret = write_file("/proc/self/setgroups", "deny");
- if (ret != NSID_PASS)
- return ret;
- sprintf(buf, "0 %d 1", gid);
- ret = write_file("/proc/self/gid_map", buf);
- if (ret != NSID_PASS)
- return ret;
-
- ret = mount("", "/", NULL, MS_REC|MS_PRIVATE, NULL);
- if (ret == -1) {
- ksft_print_msg("making mount tree private: %s\n",
- strerror(errno));
+ if (setup_userns() != 0)
return NSID_ERROR;
- }
return NSID_PASS;
}
@@ -174,9 +98,9 @@ static int _test_statmount_mnt_ns_id(void)
if (ret != NSID_PASS)
return ret;
- ret = get_mnt_id("/", &root_id);
- if (ret != NSID_PASS)
- return ret;
+ root_id = get_unique_mnt_id("/");
+ if (!root_id)
+ return NSID_ERROR;
ret = statmount(root_id, 0, STATMOUNT_MNT_NS_ID, &sm, sizeof(sm), 0);
if (ret == -1) {
diff --git a/tools/testing/selftests/filesystems/utils.c b/tools/testing/selftests/filesystems/utils.c
index e553c89c5b19..c43a69dffd83 100644
--- a/tools/testing/selftests/filesystems/utils.c
+++ b/tools/testing/selftests/filesystems/utils.c
@@ -18,7 +18,10 @@
#include <sys/types.h>
#include <sys/wait.h>
#include <sys/xattr.h>
+#include <sys/mount.h>
+#include "../kselftest.h"
+#include "wrappers.h"
#include "utils.h"
#define MAX_USERNS_LEVEL 32
@@ -447,6 +450,71 @@ out_close:
return fret;
}
+static int write_file(const char *path, const char *val)
+{
+ int fd = open(path, O_WRONLY);
+ size_t len = strlen(val);
+ int ret;
+
+ if (fd == -1) {
+ ksft_print_msg("opening %s for write: %s\n", path, strerror(errno));
+ return -1;
+ }
+
+ ret = write(fd, val, len);
+ if (ret == -1) {
+ ksft_print_msg("writing to %s: %s\n", path, strerror(errno));
+ return -1;
+ }
+ if (ret != len) {
+ ksft_print_msg("short write to %s\n", path);
+ return -1;
+ }
+
+ ret = close(fd);
+ if (ret == -1) {
+ ksft_print_msg("closing %s\n", path);
+ return -1;
+ }
+
+ return 0;
+}
+
+int setup_userns(void)
+{
+ int ret;
+ char buf[32];
+ uid_t uid = getuid();
+ gid_t gid = getgid();
+
+ ret = unshare(CLONE_NEWNS|CLONE_NEWUSER|CLONE_NEWPID);
+ if (ret) {
+ ksft_exit_fail_msg("unsharing mountns and userns: %s\n",
+ strerror(errno));
+ return ret;
+ }
+
+ sprintf(buf, "0 %d 1", uid);
+ ret = write_file("/proc/self/uid_map", buf);
+ if (ret)
+ return ret;
+ ret = write_file("/proc/self/setgroups", "deny");
+ if (ret)
+ return ret;
+ sprintf(buf, "0 %d 1", gid);
+ ret = write_file("/proc/self/gid_map", buf);
+ if (ret)
+ return ret;
+
+ ret = mount("", "/", NULL, MS_REC|MS_PRIVATE, NULL);
+ if (ret) {
+ ksft_print_msg("making mount tree private: %s\n", strerror(errno));
+ return ret;
+ }
+
+ return 0;
+}
+
/* caps_down - lower all effective caps */
int caps_down(void)
{
@@ -499,3 +567,23 @@ out:
cap_free(caps);
return fret;
}
+
+uint64_t get_unique_mnt_id(const char *path)
+{
+ struct statx sx;
+ int ret;
+
+ ret = statx(AT_FDCWD, path, 0, STATX_MNT_ID_UNIQUE, &sx);
+ if (ret == -1) {
+ ksft_print_msg("retrieving unique mount ID for %s: %s\n", path,
+ strerror(errno));
+ return 0;
+ }
+
+ if (!(sx.stx_mask & STATX_MNT_ID_UNIQUE)) {
+ ksft_print_msg("no unique mount ID available for %s\n", path);
+ return 0;
+ }
+
+ return sx.stx_mnt_id;
+}
diff --git a/tools/testing/selftests/filesystems/utils.h b/tools/testing/selftests/filesystems/utils.h
index 7f1df2a3e94c..70f7ccc607f4 100644
--- a/tools/testing/selftests/filesystems/utils.h
+++ b/tools/testing/selftests/filesystems/utils.h
@@ -27,6 +27,7 @@ extern int caps_down(void);
extern int cap_down(cap_value_t down);
extern bool switch_ids(uid_t uid, gid_t gid);
+extern int setup_userns(void);
static inline bool switch_userns(int fd, uid_t uid, gid_t gid, bool drop_caps)
{
@@ -42,4 +43,6 @@ static inline bool switch_userns(int fd, uid_t uid, gid_t gid, bool drop_caps)
return true;
}
+extern uint64_t get_unique_mnt_id(const char *path);
+
#endif /* __IDMAP_UTILS_H */
diff --git a/tools/testing/selftests/filesystems/overlayfs/wrappers.h b/tools/testing/selftests/filesystems/wrappers.h
index c38bc48e0cfa..420ae4f908cf 100644
--- a/tools/testing/selftests/filesystems/overlayfs/wrappers.h
+++ b/tools/testing/selftests/filesystems/wrappers.h
@@ -9,6 +9,10 @@
#include <linux/mount.h>
#include <sys/syscall.h>
+#ifndef STATX_MNT_ID_UNIQUE
+#define STATX_MNT_ID_UNIQUE 0x00004000U /* Want/got extended stx_mount_id */
+#endif
+
static inline int sys_fsopen(const char *fsname, unsigned int flags)
{
return syscall(__NR_fsopen, fsname, flags);
@@ -36,6 +40,28 @@ static inline int sys_mount(const char *src, const char *tgt, const char *fst,
#define MOVE_MOUNT_F_EMPTY_PATH 0x00000004 /* Empty from path permitted */
#endif
+#ifndef MOVE_MOUNT_T_EMPTY_PATH
+#define MOVE_MOUNT_T_EMPTY_PATH 0x00000040 /* Empty to path permitted */
+#endif
+
+#ifndef __NR_move_mount
+ #if defined __alpha__
+ #define __NR_move_mount 539
+ #elif defined _MIPS_SIM
+ #if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */
+ #define __NR_move_mount 4429
+ #endif
+ #if _MIPS_SIM == _MIPS_SIM_NABI32 /* n32 */
+ #define __NR_move_mount 6429
+ #endif
+ #if _MIPS_SIM == _MIPS_SIM_ABI64 /* n64 */
+ #define __NR_move_mount 5429
+ #endif
+ #else
+ #define __NR_move_mount 429
+ #endif
+#endif
+
static inline int sys_move_mount(int from_dfd, const char *from_pathname,
int to_dfd, const char *to_pathname,
unsigned int flags)
@@ -53,7 +79,25 @@ static inline int sys_move_mount(int from_dfd, const char *from_pathname,
#endif
#ifndef AT_RECURSIVE
-#define AT_RECURSIVE 0x8000
+#define AT_RECURSIVE 0x8000 /* Apply to the entire subtree */
+#endif
+
+#ifndef __NR_open_tree
+ #if defined __alpha__
+ #define __NR_open_tree 538
+ #elif defined _MIPS_SIM
+ #if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */
+ #define __NR_open_tree 4428
+ #endif
+ #if _MIPS_SIM == _MIPS_SIM_NABI32 /* n32 */
+ #define __NR_open_tree 6428
+ #endif
+ #if _MIPS_SIM == _MIPS_SIM_ABI64 /* n64 */
+ #define __NR_open_tree 5428
+ #endif
+ #else
+ #define __NR_open_tree 428
+ #endif
#endif
static inline int sys_open_tree(int dfd, const char *filename, unsigned int flags)
diff --git a/tools/testing/selftests/mount_setattr/Makefile b/tools/testing/selftests/mount_setattr/Makefile
index 0c0d7b1234c1..4d4f810cdf2c 100644
--- a/tools/testing/selftests/mount_setattr/Makefile
+++ b/tools/testing/selftests/mount_setattr/Makefile
@@ -2,6 +2,8 @@
# Makefile for mount selftests.
CFLAGS = -g $(KHDR_INCLUDES) -Wall -O2 -pthread
+LOCAL_HDRS += ../filesystems/wrappers.h
+
TEST_GEN_PROGS := mount_setattr_test
include ../lib.mk
diff --git a/tools/testing/selftests/mount_setattr/mount_setattr_test.c b/tools/testing/selftests/mount_setattr/mount_setattr_test.c
index 48a000cabc97..8b378c91debf 100644
--- a/tools/testing/selftests/mount_setattr/mount_setattr_test.c
+++ b/tools/testing/selftests/mount_setattr/mount_setattr_test.c
@@ -20,7 +20,7 @@
#include <stdarg.h>
#include <linux/mount.h>
-#include "../filesystems/overlayfs/wrappers.h"
+#include "../filesystems/wrappers.h"
#include "../kselftest_harness.h"
#ifndef CLONE_NEWNS
@@ -107,46 +107,6 @@
#endif
#endif
-#ifndef __NR_open_tree
- #if defined __alpha__
- #define __NR_open_tree 538
- #elif defined _MIPS_SIM
- #if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */
- #define __NR_open_tree 4428
- #endif
- #if _MIPS_SIM == _MIPS_SIM_NABI32 /* n32 */
- #define __NR_open_tree 6428
- #endif
- #if _MIPS_SIM == _MIPS_SIM_ABI64 /* n64 */
- #define __NR_open_tree 5428
- #endif
- #elif defined __ia64__
- #define __NR_open_tree (428 + 1024)
- #else
- #define __NR_open_tree 428
- #endif
-#endif
-
-#ifndef __NR_move_mount
- #if defined __alpha__
- #define __NR_move_mount 539
- #elif defined _MIPS_SIM
- #if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */
- #define __NR_move_mount 4429
- #endif
- #if _MIPS_SIM == _MIPS_SIM_NABI32 /* n32 */
- #define __NR_move_mount 6429
- #endif
- #if _MIPS_SIM == _MIPS_SIM_ABI64 /* n64 */
- #define __NR_move_mount 5429
- #endif
- #elif defined __ia64__
- #define __NR_move_mount (428 + 1024)
- #else
- #define __NR_move_mount 429
- #endif
-#endif
-
#ifndef MOUNT_ATTR_IDMAP
#define MOUNT_ATTR_IDMAP 0x00100000
#endif
@@ -161,23 +121,6 @@ static inline int sys_mount_setattr(int dfd, const char *path, unsigned int flag
return syscall(__NR_mount_setattr, dfd, path, flags, attr, size);
}
-#ifndef OPEN_TREE_CLONE
-#define OPEN_TREE_CLONE 1
-#endif
-
-#ifndef OPEN_TREE_CLOEXEC
-#define OPEN_TREE_CLOEXEC O_CLOEXEC
-#endif
-
-#ifndef AT_RECURSIVE
-#define AT_RECURSIVE 0x8000 /* Apply to the entire subtree */
-#endif
-
-static inline int sys_open_tree(int dfd, const char *filename, unsigned int flags)
-{
- return syscall(__NR_open_tree, dfd, filename, flags);
-}
-
static ssize_t write_nointr(int fd, const void *buf, size_t count)
{
ssize_t ret;
@@ -1076,7 +1019,7 @@ FIXTURE_SETUP(mount_setattr_idmapped)
ASSERT_EQ(mkdir("/mnt/D", 0777), 0);
img_fd = openat(-EBADF, "/mnt/C/ext4.img", O_CREAT | O_WRONLY, 0600);
ASSERT_GE(img_fd, 0);
- ASSERT_EQ(ftruncate(img_fd, 1024 * 2048), 0);
+ ASSERT_EQ(ftruncate(img_fd, 2147483648 /* 2 GB */), 0);
ASSERT_EQ(system("mkfs.ext4 -q /mnt/C/ext4.img"), 0);
ASSERT_EQ(system("mount -o loop -t ext4 /mnt/C/ext4.img /mnt/D/"), 0);
ASSERT_EQ(close(img_fd), 0);
diff --git a/tools/testing/selftests/pidfd/pidfd.h b/tools/testing/selftests/pidfd/pidfd.h
index 55bcf81a2b9a..efd74063126e 100644
--- a/tools/testing/selftests/pidfd/pidfd.h
+++ b/tools/testing/selftests/pidfd/pidfd.h
@@ -131,6 +131,26 @@
#define PIDFD_INFO_EXIT (1UL << 3) /* Always returned if available, even if not requested */
#endif
+#ifndef PIDFD_INFO_COREDUMP
+#define PIDFD_INFO_COREDUMP (1UL << 4)
+#endif
+
+#ifndef PIDFD_COREDUMPED
+#define PIDFD_COREDUMPED (1U << 0) /* Did crash and... */
+#endif
+
+#ifndef PIDFD_COREDUMP_SKIP
+#define PIDFD_COREDUMP_SKIP (1U << 1) /* coredumping generation was skipped. */
+#endif
+
+#ifndef PIDFD_COREDUMP_USER
+#define PIDFD_COREDUMP_USER (1U << 2) /* coredump was done as the user. */
+#endif
+
+#ifndef PIDFD_COREDUMP_ROOT
+#define PIDFD_COREDUMP_ROOT (1U << 3) /* coredump was done as root. */
+#endif
+
#ifndef PIDFD_THREAD
#define PIDFD_THREAD O_EXCL
#endif
@@ -150,6 +170,8 @@ struct pidfd_info {
__u32 fsuid;
__u32 fsgid;
__s32 exit_code;
+ __u32 coredump_mask;
+ __u32 __spare1;
};
/*
diff --git a/tools/testing/selftests/pidfd/pidfd_bind_mount.c b/tools/testing/selftests/pidfd/pidfd_bind_mount.c
index 7822dd080258..c094aeb1c620 100644
--- a/tools/testing/selftests/pidfd/pidfd_bind_mount.c
+++ b/tools/testing/selftests/pidfd/pidfd_bind_mount.c
@@ -15,79 +15,7 @@
#include "pidfd.h"
#include "../kselftest_harness.h"
-
-#ifndef __NR_open_tree
- #if defined __alpha__
- #define __NR_open_tree 538
- #elif defined _MIPS_SIM
- #if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */
- #define __NR_open_tree 4428
- #endif
- #if _MIPS_SIM == _MIPS_SIM_NABI32 /* n32 */
- #define __NR_open_tree 6428
- #endif
- #if _MIPS_SIM == _MIPS_SIM_ABI64 /* n64 */
- #define __NR_open_tree 5428
- #endif
- #elif defined __ia64__
- #define __NR_open_tree (428 + 1024)
- #else
- #define __NR_open_tree 428
- #endif
-#endif
-
-#ifndef __NR_move_mount
- #if defined __alpha__
- #define __NR_move_mount 539
- #elif defined _MIPS_SIM
- #if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */
- #define __NR_move_mount 4429
- #endif
- #if _MIPS_SIM == _MIPS_SIM_NABI32 /* n32 */
- #define __NR_move_mount 6429
- #endif
- #if _MIPS_SIM == _MIPS_SIM_ABI64 /* n64 */
- #define __NR_move_mount 5429
- #endif
- #elif defined __ia64__
- #define __NR_move_mount (428 + 1024)
- #else
- #define __NR_move_mount 429
- #endif
-#endif
-
-#ifndef MOVE_MOUNT_F_EMPTY_PATH
-#define MOVE_MOUNT_F_EMPTY_PATH 0x00000004 /* Empty from path permitted */
-#endif
-
-#ifndef MOVE_MOUNT_F_EMPTY_PATH
-#define MOVE_MOUNT_T_EMPTY_PATH 0x00000040 /* Empty to path permitted */
-#endif
-
-static inline int sys_move_mount(int from_dfd, const char *from_pathname,
- int to_dfd, const char *to_pathname,
- unsigned int flags)
-{
- return syscall(__NR_move_mount, from_dfd, from_pathname, to_dfd,
- to_pathname, flags);
-}
-
-#ifndef OPEN_TREE_CLONE
-#define OPEN_TREE_CLONE 1
-#endif
-
-#ifndef OPEN_TREE_CLOEXEC
-#define OPEN_TREE_CLOEXEC O_CLOEXEC
-#endif
-
-#ifndef AT_RECURSIVE
-#define AT_RECURSIVE 0x8000 /* Apply to the entire subtree */
-#endif
-
-static inline int sys_open_tree(int dfd, const char *filename, unsigned int flags)
-{
- return syscall(__NR_open_tree, dfd, filename, flags);
-}
+#include "../filesystems/wrappers.h"
FIXTURE(pidfd_bind_mount) {
char template[PATH_MAX];
diff --git a/tools/testing/selftests/pidfd/pidfd_info_test.c b/tools/testing/selftests/pidfd/pidfd_info_test.c
index 1758a1b0457b..a0eb6e81eaa2 100644
--- a/tools/testing/selftests/pidfd/pidfd_info_test.c
+++ b/tools/testing/selftests/pidfd/pidfd_info_test.c
@@ -299,6 +299,7 @@ TEST_F(pidfd_info, thread_group)
/* Opening a thread as a thread-group leader must fail. */
pidfd_thread = sys_pidfd_open(pid_thread, 0);
ASSERT_LT(pidfd_thread, 0);
+ ASSERT_EQ(errno, ENOENT);
/* Opening a thread as a PIDFD_THREAD must succeed. */
pidfd_thread = sys_pidfd_open(pid_thread, PIDFD_THREAD);
@@ -362,9 +363,9 @@ TEST_F(pidfd_info, thread_group)
ASSERT_EQ(ioctl(pidfd_leader, PIDFD_GET_INFO, &info), 0);
ASSERT_FALSE(!!(info.mask & PIDFD_INFO_CREDS));
ASSERT_TRUE(!!(info.mask & PIDFD_INFO_EXIT));
- /* The thread-group leader exited successfully. Only the specific thread was SIGKILLed. */
- ASSERT_TRUE(WIFEXITED(info.exit_code));
- ASSERT_EQ(WEXITSTATUS(info.exit_code), 0);
+ /* Even though the thread-group exited successfully it will still report the group exit code. */
+ ASSERT_TRUE(WIFSIGNALED(info.exit_code));
+ ASSERT_EQ(WTERMSIG(info.exit_code), SIGKILL);
/*
* Retrieve exit information for the thread-group leader via the
@@ -375,9 +376,9 @@ TEST_F(pidfd_info, thread_group)
ASSERT_FALSE(!!(info2.mask & PIDFD_INFO_CREDS));
ASSERT_TRUE(!!(info2.mask & PIDFD_INFO_EXIT));
- /* The thread-group leader exited successfully. Only the specific thread was SIGKILLed. */
- ASSERT_TRUE(WIFEXITED(info2.exit_code));
- ASSERT_EQ(WEXITSTATUS(info2.exit_code), 0);
+ /* Even though the thread-group exited successfully it will still report the group exit code. */
+ ASSERT_TRUE(WIFSIGNALED(info2.exit_code));
+ ASSERT_EQ(WTERMSIG(info2.exit_code), SIGKILL);
/* Retrieve exit information for the thread. */
info.mask = PIDFD_INFO_CGROUPID | PIDFD_INFO_EXIT;
diff --git a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json
index a951c0d33cd2..ddc97ecd8b39 100644
--- a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json
+++ b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json
@@ -573,5 +573,32 @@
"teardown": [
"$TC qdisc del dev $DEV1 handle 1: root"
]
+ },
+ {
+ "id": "831d",
+ "name": "Test HFSC qlen accounting with DRR/NETEM/BLACKHOLE chain",
+ "category": ["qdisc", "hfsc", "drr", "netem", "blackhole"],
+ "plugins": { "requires": ["nsPlugin", "scapyPlugin"] },
+ "setup": [
+ "$IP link set dev $DEV1 up || true",
+ "$TC qdisc add dev $DEV1 root handle 1: drr",
+ "$TC filter add dev $DEV1 parent 1: basic classid 1:1",
+ "$TC class add dev $DEV1 parent 1: classid 1:1 drr",
+ "$TC qdisc add dev $DEV1 parent 1:1 handle 2: hfsc def 1",
+ "$TC class add dev $DEV1 parent 2: classid 2:1 hfsc rt m1 8 d 1 m2 0",
+ "$TC qdisc add dev $DEV1 parent 2:1 handle 3: netem",
+ "$TC qdisc add dev $DEV1 parent 3:1 handle 4: blackhole"
+ ],
+ "scapy": {
+ "iface": "$DEV0",
+ "count": 5,
+ "packet": "Ether()/IP(dst='10.10.10.1', src='10.10.10.10')/ICMP()"
+ },
+ "cmdUnderTest": "$TC -s qdisc show dev $DEV1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC -s qdisc show dev $DEV1",
+ "matchPattern": "qdisc hfsc",
+ "matchCount": "1",
+ "teardown": ["$TC qdisc del dev $DEV1 root handle 1: drr"]
}
]
diff --git a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/codel.json b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/codel.json
index e9469ee71e6f..6d515d0e5ed6 100644
--- a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/codel.json
+++ b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/codel.json
@@ -189,5 +189,29 @@
"teardown": [
"$TC qdisc del dev $DUMMY handle 1: root"
]
+ },
+ {
+ "id": "deb1",
+ "name": "CODEL test qdisc limit trimming",
+ "category": ["qdisc", "codel"],
+ "plugins": {
+ "requires": ["nsPlugin", "scapyPlugin"]
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 handle 1: root codel limit 10"
+ ],
+ "scapy": [
+ {
+ "iface": "$DEV0",
+ "count": 10,
+ "packet": "Ether(type=0x800)/IP(src='10.0.0.10',dst='10.0.0.20')/TCP(sport=5000,dport=10)"
+ }
+ ],
+ "cmdUnderTest": "$TC qdisc change dev $DEV1 handle 1: root codel limit 1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC qdisc show dev $DEV1",
+ "matchPattern": "qdisc codel 1: root refcnt [0-9]+ limit 1p target 5ms interval 100ms",
+ "matchCount": "1",
+ "teardown": ["$TC qdisc del dev $DEV1 handle 1: root"]
}
]
diff --git a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/fq.json b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/fq.json
index 3a537b2ec4c9..24faf4e12dfa 100644
--- a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/fq.json
+++ b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/fq.json
@@ -377,5 +377,27 @@
"teardown": [
"$TC qdisc del dev $DUMMY handle 1: root"
]
+ },
+ {
+ "id": "9479",
+ "name": "FQ test qdisc limit trimming",
+ "category": ["qdisc", "fq"],
+ "plugins": {"requires": ["nsPlugin", "scapyPlugin"]},
+ "setup": [
+ "$TC qdisc add dev $DEV1 handle 1: root fq limit 10"
+ ],
+ "scapy": [
+ {
+ "iface": "$DEV0",
+ "count": 10,
+ "packet": "Ether(type=0x800)/IP(src='10.0.0.10',dst='10.0.0.20')/TCP(sport=5000,dport=10)"
+ }
+ ],
+ "cmdUnderTest": "$TC qdisc change dev $DEV1 handle 1: root fq limit 1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC qdisc show dev $DEV1",
+ "matchPattern": "qdisc fq 1: root refcnt [0-9]+ limit 1p",
+ "matchCount": "1",
+ "teardown": ["$TC qdisc del dev $DEV1 handle 1: root"]
}
]
diff --git a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/fq_codel.json b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/fq_codel.json
index 9774b1e8801b..4ce62b857fd7 100644
--- a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/fq_codel.json
+++ b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/fq_codel.json
@@ -294,5 +294,27 @@
"teardown": [
"$TC qdisc del dev $DUMMY handle 1: root"
]
+ },
+ {
+ "id": "0436",
+ "name": "FQ_CODEL test qdisc limit trimming",
+ "category": ["qdisc", "fq_codel"],
+ "plugins": {"requires": ["nsPlugin", "scapyPlugin"]},
+ "setup": [
+ "$TC qdisc add dev $DEV1 handle 1: root fq_codel limit 10"
+ ],
+ "scapy": [
+ {
+ "iface": "$DEV0",
+ "count": 10,
+ "packet": "Ether(type=0x800)/IP(src='10.0.0.10',dst='10.0.0.20')/TCP(sport=5000,dport=10)"
+ }
+ ],
+ "cmdUnderTest": "$TC qdisc change dev $DEV1 handle 1: root fq_codel limit 1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC qdisc show dev $DEV1",
+ "matchPattern": "qdisc fq_codel 1: root refcnt [0-9]+ limit 1p flows 1024 quantum.*target 5ms interval 100ms memory_limit 32Mb ecn drop_batch 64",
+ "matchCount": "1",
+ "teardown": ["$TC qdisc del dev $DEV1 handle 1: root"]
}
]
diff --git a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/fq_pie.json b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/fq_pie.json
index d012d88d67fe..229fe1bf4a90 100644
--- a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/fq_pie.json
+++ b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/fq_pie.json
@@ -18,5 +18,27 @@
"matchCount": "1",
"teardown": [
]
+ },
+ {
+ "id": "83bf",
+ "name": "FQ_PIE test qdisc limit trimming",
+ "category": ["qdisc", "fq_pie"],
+ "plugins": {"requires": ["nsPlugin", "scapyPlugin"]},
+ "setup": [
+ "$TC qdisc add dev $DEV1 handle 1: root fq_pie limit 10"
+ ],
+ "scapy": [
+ {
+ "iface": "$DEV0",
+ "count": 10,
+ "packet": "Ether(type=0x800)/IP(src='10.0.0.10',dst='10.0.0.20')/TCP(sport=5000,dport=10)"
+ }
+ ],
+ "cmdUnderTest": "$TC qdisc change dev $DEV1 handle 1: root fq_pie limit 1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC qdisc show dev $DEV1",
+ "matchPattern": "qdisc fq_pie 1: root refcnt [0-9]+ limit 1p",
+ "matchCount": "1",
+ "teardown": ["$TC qdisc del dev $DEV1 handle 1: root"]
}
]
diff --git a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/hhf.json b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/hhf.json
index dbef5474b26b..0ca19fac54a5 100644
--- a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/hhf.json
+++ b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/hhf.json
@@ -188,5 +188,27 @@
"teardown": [
"$TC qdisc del dev $DUMMY handle 1: root"
]
+ },
+ {
+ "id": "385f",
+ "name": "HHF test qdisc limit trimming",
+ "category": ["qdisc", "hhf"],
+ "plugins": {"requires": ["nsPlugin", "scapyPlugin"]},
+ "setup": [
+ "$TC qdisc add dev $DEV1 handle 1: root hhf limit 10"
+ ],
+ "scapy": [
+ {
+ "iface": "$DEV0",
+ "count": 10,
+ "packet": "Ether(type=0x800)/IP(src='10.0.0.10',dst='10.0.0.20')/TCP(sport=5000,dport=10)"
+ }
+ ],
+ "cmdUnderTest": "$TC qdisc change dev $DEV1 handle 1: root hhf limit 1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC qdisc show dev $DEV1",
+ "matchPattern": "qdisc hhf 1: root refcnt [0-9]+ limit 1p.*hh_limit 2048 reset_timeout 40ms admit_bytes 128Kb evict_timeout 1s non_hh_weight 2",
+ "matchCount": "1",
+ "teardown": ["$TC qdisc del dev $DEV1 handle 1: root"]
}
]
diff --git a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/pie.json b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/pie.json
new file mode 100644
index 000000000000..1a98b66e8030
--- /dev/null
+++ b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/pie.json
@@ -0,0 +1,24 @@
+[
+ {
+ "id": "6158",
+ "name": "PIE test qdisc limit trimming",
+ "category": ["qdisc", "pie"],
+ "plugins": {"requires": ["nsPlugin", "scapyPlugin"]},
+ "setup": [
+ "$TC qdisc add dev $DEV1 handle 1: root pie limit 10"
+ ],
+ "scapy": [
+ {
+ "iface": "$DEV0",
+ "count": 10,
+ "packet": "Ether(type=0x800)/IP(src='10.0.0.10',dst='10.0.0.20')/TCP(sport=5000,dport=10)"
+ }
+ ],
+ "cmdUnderTest": "$TC qdisc change dev $DEV1 handle 1: root pie limit 1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC qdisc show dev $DEV1",
+ "matchPattern": "qdisc pie 1: root refcnt [0-9]+ limit 1p",
+ "matchCount": "1",
+ "teardown": ["$TC qdisc del dev $DEV1 handle 1: root"]
+ }
+]
diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile
index f34ac0bac696..4dde8838261d 100644
--- a/tools/testing/selftests/ublk/Makefile
+++ b/tools/testing/selftests/ublk/Makefile
@@ -1,6 +1,10 @@
# SPDX-License-Identifier: GPL-2.0
-CFLAGS += -O3 -Wl,-no-as-needed -Wall -I $(top_srcdir)
+CFLAGS += -O3 -Wl,-no-as-needed -Wall -I $(top_srcdir)/usr/include
+ifneq ($(WERROR),0)
+ CFLAGS += -Werror
+endif
+
LDLIBS += -lpthread -lm -luring
TEST_PROGS := test_generic_01.sh
@@ -11,6 +15,11 @@ TEST_PROGS += test_generic_05.sh
TEST_PROGS += test_generic_06.sh
TEST_PROGS += test_generic_07.sh
+TEST_PROGS += test_generic_08.sh
+TEST_PROGS += test_generic_09.sh
+TEST_PROGS += test_generic_10.sh
+TEST_PROGS += test_generic_11.sh
+
TEST_PROGS += test_null_01.sh
TEST_PROGS += test_null_02.sh
TEST_PROGS += test_loop_01.sh
diff --git a/tools/testing/selftests/ublk/fault_inject.c b/tools/testing/selftests/ublk/fault_inject.c
index 94a8e729ba4c..5421774d7867 100644
--- a/tools/testing/selftests/ublk/fault_inject.c
+++ b/tools/testing/selftests/ublk/fault_inject.c
@@ -16,6 +16,11 @@ static int ublk_fault_inject_tgt_init(const struct dev_ctx *ctx,
const struct ublksrv_ctrl_dev_info *info = &dev->dev_info;
unsigned long dev_size = 250UL << 30;
+ if (ctx->auto_zc_fallback) {
+ ublk_err("%s: not support auto_zc_fallback\n", __func__);
+ return -EINVAL;
+ }
+
dev->tgt.dev_size = dev_size;
dev->tgt.params = (struct ublk_params) {
.types = UBLK_PARAM_TYPE_BASIC,
diff --git a/tools/testing/selftests/ublk/file_backed.c b/tools/testing/selftests/ublk/file_backed.c
index 6f34eabfae97..509842df9bee 100644
--- a/tools/testing/selftests/ublk/file_backed.c
+++ b/tools/testing/selftests/ublk/file_backed.c
@@ -29,19 +29,23 @@ static int loop_queue_flush_io(struct ublk_queue *q, const struct ublksrv_io_des
static int loop_queue_tgt_rw_io(struct ublk_queue *q, const struct ublksrv_io_desc *iod, int tag)
{
unsigned ublk_op = ublksrv_get_op(iod);
- int zc = ublk_queue_use_zc(q);
- enum io_uring_op op = ublk_to_uring_op(iod, zc);
+ unsigned zc = ublk_queue_use_zc(q);
+ unsigned auto_zc = ublk_queue_use_auto_zc(q);
+ enum io_uring_op op = ublk_to_uring_op(iod, zc | auto_zc);
struct io_uring_sqe *sqe[3];
+ void *addr = (zc | auto_zc) ? NULL : (void *)iod->addr;
- if (!zc) {
+ if (!zc || auto_zc) {
ublk_queue_alloc_sqes(q, sqe, 1);
if (!sqe[0])
return -ENOMEM;
io_uring_prep_rw(op, sqe[0], 1 /*fds[1]*/,
- (void *)iod->addr,
+ addr,
iod->nr_sectors << 9,
iod->start_sector << 9);
+ if (auto_zc)
+ sqe[0]->buf_index = tag;
io_uring_sqe_set_flags(sqe[0], IOSQE_FIXED_FILE);
/* bit63 marks us as tgt io */
sqe[0]->user_data = build_user_data(tag, ublk_op, 0, 1);
@@ -145,6 +149,11 @@ static int ublk_loop_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev)
},
};
+ if (ctx->auto_zc_fallback) {
+ ublk_err("%s: not support auto_zc_fallback\n", __func__);
+ return -EINVAL;
+ }
+
ret = backing_file_tgt_init(dev);
if (ret)
return ret;
diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c
index 842b40736a9b..b5131a000795 100644
--- a/tools/testing/selftests/ublk/kublk.c
+++ b/tools/testing/selftests/ublk/kublk.c
@@ -216,6 +216,30 @@ static int ublk_ctrl_get_features(struct ublk_dev *dev,
return __ublk_ctrl_cmd(dev, &data);
}
+static int ublk_ctrl_update_size(struct ublk_dev *dev,
+ __u64 nr_sects)
+{
+ struct ublk_ctrl_cmd_data data = {
+ .cmd_op = UBLK_U_CMD_UPDATE_SIZE,
+ .flags = CTRL_CMD_HAS_DATA,
+ };
+
+ data.data[0] = nr_sects;
+ return __ublk_ctrl_cmd(dev, &data);
+}
+
+static int ublk_ctrl_quiesce_dev(struct ublk_dev *dev,
+ unsigned int timeout_ms)
+{
+ struct ublk_ctrl_cmd_data data = {
+ .cmd_op = UBLK_U_CMD_QUIESCE_DEV,
+ .flags = CTRL_CMD_HAS_DATA,
+ };
+
+ data.data[0] = timeout_ms;
+ return __ublk_ctrl_cmd(dev, &data);
+}
+
static const char *ublk_dev_state_desc(struct ublk_dev *dev)
{
switch (dev->dev_info.state) {
@@ -405,7 +429,7 @@ static void ublk_queue_deinit(struct ublk_queue *q)
free(q->ios[i].buf_addr);
}
-static int ublk_queue_init(struct ublk_queue *q)
+static int ublk_queue_init(struct ublk_queue *q, unsigned extra_flags)
{
struct ublk_dev *dev = q->dev;
int depth = dev->dev_info.queue_depth;
@@ -420,10 +444,14 @@ static int ublk_queue_init(struct ublk_queue *q)
q->cmd_inflight = 0;
q->tid = gettid();
- if (dev->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY) {
+ if (dev->dev_info.flags & (UBLK_F_SUPPORT_ZERO_COPY | UBLK_F_AUTO_BUF_REG)) {
q->state |= UBLKSRV_NO_BUF;
- q->state |= UBLKSRV_ZC;
+ if (dev->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY)
+ q->state |= UBLKSRV_ZC;
+ if (dev->dev_info.flags & UBLK_F_AUTO_BUF_REG)
+ q->state |= UBLKSRV_AUTO_BUF_REG;
}
+ q->state |= extra_flags;
cmd_buf_size = ublk_queue_cmd_buf_sz(q);
off = UBLKSRV_CMD_BUF_OFFSET + q->q_id * ublk_queue_max_cmd_buf_sz();
@@ -461,7 +489,7 @@ static int ublk_queue_init(struct ublk_queue *q)
goto fail;
}
- if (dev->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY) {
+ if (dev->dev_info.flags & (UBLK_F_SUPPORT_ZERO_COPY | UBLK_F_AUTO_BUF_REG)) {
ret = io_uring_register_buffers_sparse(&q->ring, q->q_depth);
if (ret) {
ublk_err("ublk dev %d queue %d register spare buffers failed %d",
@@ -525,6 +553,23 @@ static void ublk_dev_unprep(struct ublk_dev *dev)
close(dev->fds[0]);
}
+static void ublk_set_auto_buf_reg(const struct ublk_queue *q,
+ struct io_uring_sqe *sqe,
+ unsigned short tag)
+{
+ struct ublk_auto_buf_reg buf = {};
+
+ if (q->tgt_ops->buf_index)
+ buf.index = q->tgt_ops->buf_index(q, tag);
+ else
+ buf.index = tag;
+
+ if (q->state & UBLKSRV_AUTO_BUF_REG_FALLBACK)
+ buf.flags = UBLK_AUTO_BUF_REG_FALLBACK;
+
+ sqe->addr = ublk_auto_buf_reg_to_sqe_addr(&buf);
+}
+
int ublk_queue_io_cmd(struct ublk_queue *q, struct ublk_io *io, unsigned tag)
{
struct ublksrv_io_cmd *cmd;
@@ -579,6 +624,9 @@ int ublk_queue_io_cmd(struct ublk_queue *q, struct ublk_io *io, unsigned tag)
else
cmd->addr = 0;
+ if (q->state & UBLKSRV_AUTO_BUF_REG)
+ ublk_set_auto_buf_reg(q, sqe[0], tag);
+
user_data = build_user_data(tag, _IOC_NR(cmd_op), 0, 0);
io_uring_sqe_set_data64(sqe[0], user_data);
@@ -729,6 +777,7 @@ struct ublk_queue_info {
struct ublk_queue *q;
sem_t *queue_sem;
cpu_set_t *affinity;
+ unsigned char auto_zc_fallback;
};
static void *ublk_io_handler_fn(void *data)
@@ -736,9 +785,13 @@ static void *ublk_io_handler_fn(void *data)
struct ublk_queue_info *info = data;
struct ublk_queue *q = info->q;
int dev_id = q->dev->dev_info.dev_id;
+ unsigned extra_flags = 0;
int ret;
- ret = ublk_queue_init(q);
+ if (info->auto_zc_fallback)
+ extra_flags = UBLKSRV_AUTO_BUF_REG_FALLBACK;
+
+ ret = ublk_queue_init(q, extra_flags);
if (ret) {
ublk_err("ublk dev %d queue %d init queue failed\n",
dev_id, q->q_id);
@@ -831,6 +884,7 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev)
qinfo[i].q = &dev->q[i];
qinfo[i].queue_sem = &queue_sem;
qinfo[i].affinity = &affinity_buf[i];
+ qinfo[i].auto_zc_fallback = ctx->auto_zc_fallback;
pthread_create(&dev->q[i].thread, NULL,
ublk_io_handler_fn,
&qinfo[i]);
@@ -1011,6 +1065,9 @@ static int __cmd_dev_add(const struct dev_ctx *ctx)
info->nr_hw_queues = nr_queues;
info->queue_depth = depth;
info->flags = ctx->flags;
+ if ((features & UBLK_F_QUIESCE) &&
+ (info->flags & UBLK_F_USER_RECOVERY))
+ info->flags |= UBLK_F_QUIESCE;
dev->tgt.ops = ops;
dev->tgt.sq_depth = depth;
dev->tgt.cq_depth = depth;
@@ -1206,6 +1263,9 @@ static int cmd_dev_get_features(void)
[const_ilog2(UBLK_F_USER_COPY)] = "USER_COPY",
[const_ilog2(UBLK_F_ZONED)] = "ZONED",
[const_ilog2(UBLK_F_USER_RECOVERY_FAIL_IO)] = "RECOVERY_FAIL_IO",
+ [const_ilog2(UBLK_F_UPDATE_SIZE)] = "UPDATE_SIZE",
+ [const_ilog2(UBLK_F_AUTO_BUF_REG)] = "AUTO_BUF_REG",
+ [const_ilog2(UBLK_F_QUIESCE)] = "QUIESCE",
};
struct ublk_dev *dev;
__u64 features = 0;
@@ -1239,13 +1299,66 @@ static int cmd_dev_get_features(void)
return ret;
}
+static int cmd_dev_update_size(struct dev_ctx *ctx)
+{
+ struct ublk_dev *dev = ublk_ctrl_init();
+ struct ublk_params p;
+ int ret = -EINVAL;
+
+ if (!dev)
+ return -ENODEV;
+
+ if (ctx->dev_id < 0) {
+ fprintf(stderr, "device id isn't provided\n");
+ goto out;
+ }
+
+ dev->dev_info.dev_id = ctx->dev_id;
+ ret = ublk_ctrl_get_params(dev, &p);
+ if (ret < 0) {
+ ublk_err("failed to get params %d %s\n", ret, strerror(-ret));
+ goto out;
+ }
+
+ if (ctx->size & ((1 << p.basic.logical_bs_shift) - 1)) {
+ ublk_err("size isn't aligned with logical block size\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = ublk_ctrl_update_size(dev, ctx->size >> 9);
+out:
+ ublk_ctrl_deinit(dev);
+ return ret;
+}
+
+static int cmd_dev_quiesce(struct dev_ctx *ctx)
+{
+ struct ublk_dev *dev = ublk_ctrl_init();
+ int ret = -EINVAL;
+
+ if (!dev)
+ return -ENODEV;
+
+ if (ctx->dev_id < 0) {
+ fprintf(stderr, "device id isn't provided for quiesce\n");
+ goto out;
+ }
+ dev->dev_info.dev_id = ctx->dev_id;
+ ret = ublk_ctrl_quiesce_dev(dev, 10000);
+
+out:
+ ublk_ctrl_deinit(dev);
+ return ret;
+}
+
static void __cmd_create_help(char *exe, bool recovery)
{
int i;
printf("%s %s -t [null|loop|stripe|fault_inject] [-q nr_queues] [-d depth] [-n dev_id]\n",
exe, recovery ? "recover" : "add");
- printf("\t[--foreground] [--quiet] [-z] [--debug_mask mask] [-r 0|1 ] [-g]\n");
+ printf("\t[--foreground] [--quiet] [-z] [--auto_zc] [--auto_zc_fallback] [--debug_mask mask] [-r 0|1 ] [-g]\n");
printf("\t[-e 0|1 ] [-i 0|1]\n");
printf("\t[target options] [backfile1] [backfile2] ...\n");
printf("\tdefault: nr_queues=2(max 32), depth=128(max 1024), dev_id=-1(auto allocation)\n");
@@ -1281,6 +1394,8 @@ static int cmd_dev_help(char *exe)
printf("%s list [-n dev_id] -a \n", exe);
printf("\t -a list all devices, -n list specified device, default -a \n\n");
printf("%s features\n", exe);
+ printf("%s update_size -n dev_id -s|--size size_in_bytes \n", exe);
+ printf("%s quiesce -n dev_id\n", exe);
return 0;
}
@@ -1300,6 +1415,9 @@ int main(int argc, char *argv[])
{ "recovery_fail_io", 1, NULL, 'e'},
{ "recovery_reissue", 1, NULL, 'i'},
{ "get_data", 1, NULL, 'g'},
+ { "auto_zc", 0, NULL, 0 },
+ { "auto_zc_fallback", 0, NULL, 0 },
+ { "size", 1, NULL, 's'},
{ 0, 0, 0, 0 }
};
const struct ublk_tgt_ops *ops = NULL;
@@ -1321,7 +1439,7 @@ int main(int argc, char *argv[])
opterr = 0;
optind = 2;
- while ((opt = getopt_long(argc, argv, "t:n:d:q:r:e:i:gaz",
+ while ((opt = getopt_long(argc, argv, "t:n:d:q:r:e:i:s:gaz",
longopts, &option_idx)) != -1) {
switch (opt) {
case 'a':
@@ -1361,6 +1479,9 @@ int main(int argc, char *argv[])
case 'g':
ctx.flags |= UBLK_F_NEED_GET_DATA;
break;
+ case 's':
+ ctx.size = strtoull(optarg, NULL, 10);
+ break;
case 0:
if (!strcmp(longopts[option_idx].name, "debug_mask"))
ublk_dbg_mask = strtol(optarg, NULL, 16);
@@ -1368,6 +1489,10 @@ int main(int argc, char *argv[])
ublk_dbg_mask = 0;
if (!strcmp(longopts[option_idx].name, "foreground"))
ctx.fg = 1;
+ if (!strcmp(longopts[option_idx].name, "auto_zc"))
+ ctx.flags |= UBLK_F_AUTO_BUF_REG;
+ if (!strcmp(longopts[option_idx].name, "auto_zc_fallback"))
+ ctx.auto_zc_fallback = 1;
break;
case '?':
/*
@@ -1391,6 +1516,16 @@ int main(int argc, char *argv[])
}
}
+ /* auto_zc_fallback depends on F_AUTO_BUF_REG & F_SUPPORT_ZERO_COPY */
+ if (ctx.auto_zc_fallback &&
+ !((ctx.flags & UBLK_F_AUTO_BUF_REG) &&
+ (ctx.flags & UBLK_F_SUPPORT_ZERO_COPY))) {
+ ublk_err("%s: auto_zc_fallback is set but neither "
+ "F_AUTO_BUF_REG nor F_SUPPORT_ZERO_COPY is enabled\n",
+ __func__);
+ return -EINVAL;
+ }
+
i = optind;
while (i < argc && ctx.nr_files < MAX_BACK_FILES) {
ctx.files[ctx.nr_files++] = argv[i++];
@@ -1423,6 +1558,10 @@ int main(int argc, char *argv[])
ret = cmd_dev_help(argv[0]);
else if (!strcmp(cmd, "features"))
ret = cmd_dev_get_features();
+ else if (!strcmp(cmd, "update_size"))
+ ret = cmd_dev_update_size(&ctx);
+ else if (!strcmp(cmd, "quiesce"))
+ ret = cmd_dev_quiesce(&ctx);
else
cmd_dev_help(argv[0]);
diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h
index 44ee1e4ac55b..e34508bf5798 100644
--- a/tools/testing/selftests/ublk/kublk.h
+++ b/tools/testing/selftests/ublk/kublk.h
@@ -19,7 +19,6 @@
#include <sys/inotify.h>
#include <sys/wait.h>
#include <sys/eventfd.h>
-#include <sys/uio.h>
#include <sys/ipc.h>
#include <sys/shm.h>
#include <linux/io_uring.h>
@@ -85,6 +84,7 @@ struct dev_ctx {
unsigned int all:1;
unsigned int fg:1;
unsigned int recovery:1;
+ unsigned int auto_zc_fallback:1;
int _evtfd;
int _shmid;
@@ -92,6 +92,9 @@ struct dev_ctx {
/* built from shmem, only for ublk_dump_dev() */
struct ublk_dev *shadow_dev;
+ /* for 'update_size' command */
+ unsigned long long size;
+
union {
struct stripe_ctx stripe;
struct fault_inject_ctx fault_inject;
@@ -116,6 +119,7 @@ struct ublk_io {
#define UBLKSRV_NEED_COMMIT_RQ_COMP (1UL << 1)
#define UBLKSRV_IO_FREE (1UL << 2)
#define UBLKSRV_NEED_GET_DATA (1UL << 3)
+#define UBLKSRV_NEED_REG_BUF (1UL << 4)
unsigned short flags;
unsigned short refs; /* used by target code only */
@@ -141,6 +145,9 @@ struct ublk_tgt_ops {
*/
void (*parse_cmd_line)(struct dev_ctx *ctx, int argc, char *argv[]);
void (*usage)(const struct ublk_tgt_ops *ops);
+
+ /* return buffer index for UBLK_F_AUTO_BUF_REG */
+ unsigned short (*buf_index)(const struct ublk_queue *, int tag);
};
struct ublk_tgt {
@@ -169,6 +176,8 @@ struct ublk_queue {
#define UBLKSRV_QUEUE_IDLE (1U << 1)
#define UBLKSRV_NO_BUF (1U << 2)
#define UBLKSRV_ZC (1U << 3)
+#define UBLKSRV_AUTO_BUF_REG (1U << 4)
+#define UBLKSRV_AUTO_BUF_REG_FALLBACK (1U << 5)
unsigned state;
pid_t tid;
pthread_t thread;
@@ -204,6 +213,12 @@ struct ublk_dev {
extern unsigned int ublk_dbg_mask;
extern int ublk_queue_io_cmd(struct ublk_queue *q, struct ublk_io *io, unsigned tag);
+
+static inline int ublk_io_auto_zc_fallback(const struct ublksrv_io_desc *iod)
+{
+ return !!(iod->op_flags & UBLK_IO_F_NEED_REG_BUF);
+}
+
static inline int is_target_io(__u64 user_data)
{
return (user_data & (1ULL << 63)) != 0;
@@ -388,6 +403,11 @@ static inline int ublk_queue_use_zc(const struct ublk_queue *q)
return q->state & UBLKSRV_ZC;
}
+static inline int ublk_queue_use_auto_zc(const struct ublk_queue *q)
+{
+ return q->state & UBLKSRV_AUTO_BUF_REG;
+}
+
extern const struct ublk_tgt_ops null_tgt_ops;
extern const struct ublk_tgt_ops loop_tgt_ops;
extern const struct ublk_tgt_ops stripe_tgt_ops;
diff --git a/tools/testing/selftests/ublk/null.c b/tools/testing/selftests/ublk/null.c
index 91fec3690d4b..44aca31cf2b0 100644
--- a/tools/testing/selftests/ublk/null.c
+++ b/tools/testing/selftests/ublk/null.c
@@ -42,10 +42,22 @@ static int ublk_null_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev)
return 0;
}
+static void __setup_nop_io(int tag, const struct ublksrv_io_desc *iod,
+ struct io_uring_sqe *sqe)
+{
+ unsigned ublk_op = ublksrv_get_op(iod);
+
+ io_uring_prep_nop(sqe);
+ sqe->buf_index = tag;
+ sqe->flags |= IOSQE_FIXED_FILE;
+ sqe->rw_flags = IORING_NOP_FIXED_BUFFER | IORING_NOP_INJECT_RESULT;
+ sqe->len = iod->nr_sectors << 9; /* injected result */
+ sqe->user_data = build_user_data(tag, ublk_op, 0, 1);
+}
+
static int null_queue_zc_io(struct ublk_queue *q, int tag)
{
const struct ublksrv_io_desc *iod = ublk_get_iod(q, tag);
- unsigned ublk_op = ublksrv_get_op(iod);
struct io_uring_sqe *sqe[3];
ublk_queue_alloc_sqes(q, sqe, 3);
@@ -55,12 +67,8 @@ static int null_queue_zc_io(struct ublk_queue *q, int tag)
ublk_cmd_op_nr(sqe[0]->cmd_op), 0, 1);
sqe[0]->flags |= IOSQE_CQE_SKIP_SUCCESS | IOSQE_IO_HARDLINK;
- io_uring_prep_nop(sqe[1]);
- sqe[1]->buf_index = tag;
- sqe[1]->flags |= IOSQE_FIXED_FILE | IOSQE_IO_HARDLINK;
- sqe[1]->rw_flags = IORING_NOP_FIXED_BUFFER | IORING_NOP_INJECT_RESULT;
- sqe[1]->len = iod->nr_sectors << 9; /* injected result */
- sqe[1]->user_data = build_user_data(tag, ublk_op, 0, 1);
+ __setup_nop_io(tag, iod, sqe[1]);
+ sqe[1]->flags |= IOSQE_IO_HARDLINK;
io_uring_prep_buf_unregister(sqe[2], 0, tag, q->q_id, tag);
sqe[2]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[2]->cmd_op), 0, 1);
@@ -69,6 +77,16 @@ static int null_queue_zc_io(struct ublk_queue *q, int tag)
return 2;
}
+static int null_queue_auto_zc_io(struct ublk_queue *q, int tag)
+{
+ const struct ublksrv_io_desc *iod = ublk_get_iod(q, tag);
+ struct io_uring_sqe *sqe[1];
+
+ ublk_queue_alloc_sqes(q, sqe, 1);
+ __setup_nop_io(tag, iod, sqe[0]);
+ return 1;
+}
+
static void ublk_null_io_done(struct ublk_queue *q, int tag,
const struct io_uring_cqe *cqe)
{
@@ -94,22 +112,37 @@ static void ublk_null_io_done(struct ublk_queue *q, int tag,
static int ublk_null_queue_io(struct ublk_queue *q, int tag)
{
const struct ublksrv_io_desc *iod = ublk_get_iod(q, tag);
- int zc = ublk_queue_use_zc(q);
+ unsigned auto_zc = ublk_queue_use_auto_zc(q);
+ unsigned zc = ublk_queue_use_zc(q);
int queued;
- if (!zc) {
+ if (auto_zc && !ublk_io_auto_zc_fallback(iod))
+ queued = null_queue_auto_zc_io(q, tag);
+ else if (zc)
+ queued = null_queue_zc_io(q, tag);
+ else {
ublk_complete_io(q, tag, iod->nr_sectors << 9);
return 0;
}
-
- queued = null_queue_zc_io(q, tag);
ublk_queued_tgt_io(q, tag, queued);
return 0;
}
+/*
+ * return invalid buffer index for triggering auto buffer register failure,
+ * then UBLK_IO_RES_NEED_REG_BUF handling is covered
+ */
+static unsigned short ublk_null_buf_index(const struct ublk_queue *q, int tag)
+{
+ if (q->state & UBLKSRV_AUTO_BUF_REG_FALLBACK)
+ return (unsigned short)-1;
+ return tag;
+}
+
const struct ublk_tgt_ops null_tgt_ops = {
.name = "null",
.init_tgt = ublk_null_tgt_init,
.queue_io = ublk_null_queue_io,
.tgt_io_done = ublk_null_io_done,
+ .buf_index = ublk_null_buf_index,
};
diff --git a/tools/testing/selftests/ublk/stripe.c b/tools/testing/selftests/ublk/stripe.c
index 5dbd6392d83d..404a143bf3d6 100644
--- a/tools/testing/selftests/ublk/stripe.c
+++ b/tools/testing/selftests/ublk/stripe.c
@@ -70,7 +70,7 @@ static void free_stripe_array(struct stripe_array *s)
}
static void calculate_stripe_array(const struct stripe_conf *conf,
- const struct ublksrv_io_desc *iod, struct stripe_array *s)
+ const struct ublksrv_io_desc *iod, struct stripe_array *s, void *base)
{
const unsigned shift = conf->shift - 9;
const unsigned chunk_sects = 1 << shift;
@@ -102,7 +102,7 @@ static void calculate_stripe_array(const struct stripe_conf *conf,
}
assert(this->nr_vec < this->cap);
- this->vec[this->nr_vec].iov_base = (void *)(iod->addr + done);
+ this->vec[this->nr_vec].iov_base = (void *)(base + done);
this->vec[this->nr_vec++].iov_len = nr_sects << 9;
start += nr_sects;
@@ -126,15 +126,17 @@ static inline enum io_uring_op stripe_to_uring_op(
static int stripe_queue_tgt_rw_io(struct ublk_queue *q, const struct ublksrv_io_desc *iod, int tag)
{
const struct stripe_conf *conf = get_chunk_shift(q);
- int zc = !!(ublk_queue_use_zc(q) != 0);
- enum io_uring_op op = stripe_to_uring_op(iod, zc);
+ unsigned auto_zc = (ublk_queue_use_auto_zc(q) != 0);
+ unsigned zc = (ublk_queue_use_zc(q) != 0);
+ enum io_uring_op op = stripe_to_uring_op(iod, zc | auto_zc);
struct io_uring_sqe *sqe[NR_STRIPE];
struct stripe_array *s = alloc_stripe_array(conf, iod);
struct ublk_io *io = ublk_get_io(q, tag);
int i, extra = zc ? 2 : 0;
+ void *base = (zc | auto_zc) ? NULL : (void *)iod->addr;
io->private_data = s;
- calculate_stripe_array(conf, iod, s);
+ calculate_stripe_array(conf, iod, s, base);
ublk_queue_alloc_sqes(q, sqe, s->nr + extra);
@@ -153,12 +155,11 @@ static int stripe_queue_tgt_rw_io(struct ublk_queue *q, const struct ublksrv_io_
(void *)t->vec,
t->nr_vec,
t->start << 9);
- if (zc) {
+ io_uring_sqe_set_flags(sqe[i], IOSQE_FIXED_FILE);
+ if (auto_zc || zc) {
sqe[i]->buf_index = tag;
- io_uring_sqe_set_flags(sqe[i],
- IOSQE_FIXED_FILE | IOSQE_IO_HARDLINK);
- } else {
- io_uring_sqe_set_flags(sqe[i], IOSQE_FIXED_FILE);
+ if (zc)
+ sqe[i]->flags |= IOSQE_IO_HARDLINK;
}
/* bit63 marks us as tgt io */
sqe[i]->user_data = build_user_data(tag, ublksrv_get_op(iod), i - zc, 1);
@@ -287,6 +288,11 @@ static int ublk_stripe_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev)
loff_t bytes = 0;
int ret, i, mul = 1;
+ if (ctx->auto_zc_fallback) {
+ ublk_err("%s: not support auto_zc_fallback\n", __func__);
+ return -EINVAL;
+ }
+
if ((chunk_size & (chunk_size - 1)) || !chunk_size) {
ublk_err("invalid chunk size %u\n", chunk_size);
return -EINVAL;
diff --git a/tools/testing/selftests/ublk/test_common.sh b/tools/testing/selftests/ublk/test_common.sh
index a81210ca3e99..0145569ee7e9 100755
--- a/tools/testing/selftests/ublk/test_common.sh
+++ b/tools/testing/selftests/ublk/test_common.sh
@@ -23,6 +23,11 @@ _get_disk_dev_t() {
echo $(( (major & 0xfff) << 20 | (minor & 0xfffff) ))
}
+_get_disk_size()
+{
+ lsblk -b -o SIZE -n "$1"
+}
+
_run_fio_verify_io() {
fio --name=verify --rw=randwrite --direct=1 --ioengine=libaio \
--bs=8k --iodepth=32 --verify=crc32c --do_verify=1 \
@@ -215,6 +220,26 @@ _recover_ublk_dev() {
echo "$state"
}
+# quiesce device and return ublk device state
+__ublk_quiesce_dev()
+{
+ local dev_id=$1
+ local exp_state=$2
+ local state
+
+ if ! ${UBLK_PROG} quiesce -n "${dev_id}"; then
+ state=$(_get_ublk_dev_state "${dev_id}")
+ return "$state"
+ fi
+
+ for ((j=0;j<50;j++)); do
+ state=$(_get_ublk_dev_state "${dev_id}")
+ [ "$state" == "$exp_state" ] && break
+ sleep 1
+ done
+ echo "$state"
+}
+
# kill the ublk daemon and return ublk device state
__ublk_kill_daemon()
{
@@ -251,7 +276,7 @@ __run_io_and_remove()
local kill_server=$3
fio --name=job1 --filename=/dev/ublkb"${dev_id}" --ioengine=libaio \
- --rw=readwrite --iodepth=256 --size="${size}" --numjobs=4 \
+ --rw=randrw --norandommap --iodepth=256 --size="${size}" --numjobs="$(nproc)" \
--runtime=20 --time_based > /dev/null 2>&1 &
sleep 2
if [ "${kill_server}" = "yes" ]; then
@@ -303,20 +328,26 @@ run_io_and_kill_daemon()
run_io_and_recover()
{
+ local action=$1
local state
local dev_id
+ shift 1
dev_id=$(_add_ublk_dev "$@")
_check_add_dev "$TID" $?
fio --name=job1 --filename=/dev/ublkb"${dev_id}" --ioengine=libaio \
- --rw=readwrite --iodepth=256 --size="${size}" --numjobs=4 \
+ --rw=randread --iodepth=256 --size="${size}" --numjobs=4 \
--runtime=20 --time_based > /dev/null 2>&1 &
sleep 4
- state=$(__ublk_kill_daemon "${dev_id}" "QUIESCED")
+ if [ "$action" == "kill_daemon" ]; then
+ state=$(__ublk_kill_daemon "${dev_id}" "QUIESCED")
+ elif [ "$action" == "quiesce_dev" ]; then
+ state=$(__ublk_quiesce_dev "${dev_id}" "QUIESCED")
+ fi
if [ "$state" != "QUIESCED" ]; then
- echo "device isn't quiesced($state) after killing daemon"
+ echo "device isn't quiesced($state) after $action"
return 255
fi
diff --git a/tools/testing/selftests/ublk/test_generic_04.sh b/tools/testing/selftests/ublk/test_generic_04.sh
index 8a3bc080c577..8b533217d4a1 100755
--- a/tools/testing/selftests/ublk/test_generic_04.sh
+++ b/tools/testing/selftests/ublk/test_generic_04.sh
@@ -8,7 +8,7 @@ ERR_CODE=0
ublk_run_recover_test()
{
- run_io_and_recover "$@"
+ run_io_and_recover "kill_daemon" "$@"
ERR_CODE=$?
if [ ${ERR_CODE} -ne 0 ]; then
echo "$TID failure: $*"
diff --git a/tools/testing/selftests/ublk/test_generic_05.sh b/tools/testing/selftests/ublk/test_generic_05.sh
index 3bb00a347402..398e9e2b58e1 100755
--- a/tools/testing/selftests/ublk/test_generic_05.sh
+++ b/tools/testing/selftests/ublk/test_generic_05.sh
@@ -8,7 +8,7 @@ ERR_CODE=0
ublk_run_recover_test()
{
- run_io_and_recover "$@"
+ run_io_and_recover "kill_daemon" "$@"
ERR_CODE=$?
if [ ${ERR_CODE} -ne 0 ]; then
echo "$TID failure: $*"
diff --git a/tools/testing/selftests/ublk/test_generic_06.sh b/tools/testing/selftests/ublk/test_generic_06.sh
index b67230c42c84..fd42062b7b76 100755
--- a/tools/testing/selftests/ublk/test_generic_06.sh
+++ b/tools/testing/selftests/ublk/test_generic_06.sh
@@ -17,7 +17,7 @@ STARTTIME=${SECONDS}
dd if=/dev/urandom of=/dev/ublkb${dev_id} oflag=direct bs=4k count=1 status=none > /dev/null 2>&1 &
dd_pid=$!
-__ublk_kill_daemon ${dev_id} "DEAD"
+__ublk_kill_daemon ${dev_id} "DEAD" >/dev/null
wait $dd_pid
dd_exitcode=$?
diff --git a/tools/testing/selftests/ublk/test_generic_08.sh b/tools/testing/selftests/ublk/test_generic_08.sh
new file mode 100755
index 000000000000..b222f3a77e12
--- /dev/null
+++ b/tools/testing/selftests/ublk/test_generic_08.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
+
+TID="generic_08"
+ERR_CODE=0
+
+if ! _have_feature "AUTO_BUF_REG"; then
+ exit "$UBLK_SKIP_CODE"
+fi
+
+_prep_test "generic" "test UBLK_F_AUTO_BUF_REG"
+
+_create_backfile 0 256M
+_create_backfile 1 256M
+
+dev_id=$(_add_ublk_dev -t loop -q 2 --auto_zc "${UBLK_BACKFILES[0]}")
+_check_add_dev $TID $?
+
+if ! _mkfs_mount_test /dev/ublkb"${dev_id}"; then
+ _cleanup_test "generic"
+ _show_result $TID 255
+fi
+
+dev_id=$(_add_ublk_dev -t stripe --auto_zc "${UBLK_BACKFILES[0]}" "${UBLK_BACKFILES[1]}")
+_check_add_dev $TID $?
+_mkfs_mount_test /dev/ublkb"${dev_id}"
+ERR_CODE=$?
+
+_cleanup_test "generic"
+_show_result $TID $ERR_CODE
diff --git a/tools/testing/selftests/ublk/test_generic_09.sh b/tools/testing/selftests/ublk/test_generic_09.sh
new file mode 100755
index 000000000000..bb6f77ca5522
--- /dev/null
+++ b/tools/testing/selftests/ublk/test_generic_09.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
+
+TID="generic_09"
+ERR_CODE=0
+
+if ! _have_feature "AUTO_BUF_REG"; then
+ exit "$UBLK_SKIP_CODE"
+fi
+
+if ! _have_program fio; then
+ exit "$UBLK_SKIP_CODE"
+fi
+
+_prep_test "null" "basic IO test"
+
+dev_id=$(_add_ublk_dev -t null -z --auto_zc --auto_zc_fallback)
+_check_add_dev $TID $?
+
+# run fio over the two disks
+fio --name=job1 --filename=/dev/ublkb"${dev_id}" --ioengine=libaio --rw=readwrite --iodepth=32 --size=256M > /dev/null 2>&1
+ERR_CODE=$?
+
+_cleanup_test "null"
+
+_show_result $TID $ERR_CODE
diff --git a/tools/testing/selftests/ublk/test_generic_10.sh b/tools/testing/selftests/ublk/test_generic_10.sh
new file mode 100755
index 000000000000..abc11c3d416b
--- /dev/null
+++ b/tools/testing/selftests/ublk/test_generic_10.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
+
+TID="generic_10"
+ERR_CODE=0
+
+if ! _have_feature "UPDATE_SIZE"; then
+ exit "$UBLK_SKIP_CODE"
+fi
+
+_prep_test "null" "check update size"
+
+dev_id=$(_add_ublk_dev -t null)
+_check_add_dev $TID $?
+
+size=$(_get_disk_size /dev/ublkb"${dev_id}")
+size=$(( size / 2 ))
+if ! "$UBLK_PROG" update_size -n "$dev_id" -s "$size"; then
+ ERR_CODE=255
+fi
+
+new_size=$(_get_disk_size /dev/ublkb"${dev_id}")
+if [ "$new_size" != "$size" ]; then
+ ERR_CODE=255
+fi
+
+_cleanup_test "null"
+_show_result $TID $ERR_CODE
diff --git a/tools/testing/selftests/ublk/test_generic_11.sh b/tools/testing/selftests/ublk/test_generic_11.sh
new file mode 100755
index 000000000000..a00357a5ec6b
--- /dev/null
+++ b/tools/testing/selftests/ublk/test_generic_11.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
+
+TID="generic_11"
+ERR_CODE=0
+
+ublk_run_quiesce_recover()
+{
+ run_io_and_recover "quiesce_dev" "$@"
+ ERR_CODE=$?
+ if [ ${ERR_CODE} -ne 0 ]; then
+ echo "$TID failure: $*"
+ _show_result $TID $ERR_CODE
+ fi
+}
+
+if ! _have_feature "QUIESCE"; then
+ exit "$UBLK_SKIP_CODE"
+fi
+
+if ! _have_program fio; then
+ exit "$UBLK_SKIP_CODE"
+fi
+
+_prep_test "quiesce" "basic quiesce & recover function verification"
+
+_create_backfile 0 256M
+_create_backfile 1 128M
+_create_backfile 2 128M
+
+ublk_run_quiesce_recover -t null -q 2 -r 1 &
+ublk_run_quiesce_recover -t loop -q 2 -r 1 "${UBLK_BACKFILES[0]}" &
+ublk_run_quiesce_recover -t stripe -q 2 -r 1 "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" &
+wait
+
+ublk_run_quiesce_recover -t null -q 2 -r 1 -i 1 &
+ublk_run_quiesce_recover -t loop -q 2 -r 1 -i 1 "${UBLK_BACKFILES[0]}" &
+ublk_run_quiesce_recover -t stripe -q 2 -r 1 -i 1 "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" &
+wait
+
+_cleanup_test "quiesce"
+_show_result $TID $ERR_CODE
diff --git a/tools/testing/selftests/ublk/test_stress_02.sh b/tools/testing/selftests/ublk/test_stress_02.sh
index 1a9065125ae1..4bdd921081e5 100755
--- a/tools/testing/selftests/ublk/test_stress_02.sh
+++ b/tools/testing/selftests/ublk/test_stress_02.sh
@@ -25,10 +25,12 @@ _create_backfile 0 256M
_create_backfile 1 128M
_create_backfile 2 128M
-ublk_io_and_kill_daemon 8G -t null -q 4 &
-ublk_io_and_kill_daemon 256M -t loop -q 4 "${UBLK_BACKFILES[0]}" &
-ublk_io_and_kill_daemon 256M -t stripe -q 4 "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" &
-wait
+for nr_queue in 1 4; do
+ ublk_io_and_kill_daemon 8G -t null -q "$nr_queue" &
+ ublk_io_and_kill_daemon 256M -t loop -q "$nr_queue" "${UBLK_BACKFILES[0]}" &
+ ublk_io_and_kill_daemon 256M -t stripe -q "$nr_queue" "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" &
+ wait
+done
_cleanup_test "stress"
_show_result $TID $ERR_CODE
diff --git a/tools/testing/selftests/ublk/test_stress_03.sh b/tools/testing/selftests/ublk/test_stress_03.sh
index e0854f71d35b..7d728ce50774 100755
--- a/tools/testing/selftests/ublk/test_stress_03.sh
+++ b/tools/testing/selftests/ublk/test_stress_03.sh
@@ -32,6 +32,13 @@ _create_backfile 2 128M
ublk_io_and_remove 8G -t null -q 4 -z &
ublk_io_and_remove 256M -t loop -q 4 -z "${UBLK_BACKFILES[0]}" &
ublk_io_and_remove 256M -t stripe -q 4 -z "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" &
+
+if _have_feature "AUTO_BUF_REG"; then
+ ublk_io_and_remove 8G -t null -q 4 --auto_zc &
+ ublk_io_and_remove 256M -t loop -q 4 --auto_zc "${UBLK_BACKFILES[0]}" &
+ ublk_io_and_remove 256M -t stripe -q 4 --auto_zc "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" &
+ ublk_io_and_remove 8G -t null -q 4 -z --auto_zc --auto_zc_fallback &
+fi
wait
_cleanup_test "stress"
diff --git a/tools/testing/selftests/ublk/test_stress_04.sh b/tools/testing/selftests/ublk/test_stress_04.sh
index 1798a98387e8..9bcfa64ea1f0 100755
--- a/tools/testing/selftests/ublk/test_stress_04.sh
+++ b/tools/testing/selftests/ublk/test_stress_04.sh
@@ -31,6 +31,13 @@ _create_backfile 2 128M
ublk_io_and_kill_daemon 8G -t null -q 4 -z &
ublk_io_and_kill_daemon 256M -t loop -q 4 -z "${UBLK_BACKFILES[0]}" &
ublk_io_and_kill_daemon 256M -t stripe -q 4 -z "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" &
+
+if _have_feature "AUTO_BUF_REG"; then
+ ublk_io_and_kill_daemon 8G -t null -q 4 --auto_zc &
+ ublk_io_and_kill_daemon 256M -t loop -q 4 --auto_zc "${UBLK_BACKFILES[0]}" &
+ ublk_io_and_kill_daemon 256M -t stripe -q 4 --auto_zc "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" &
+ ublk_io_and_kill_daemon 8G -t null -q 4 -z --auto_zc --auto_zc_fallback &
+fi
wait
_cleanup_test "stress"
diff --git a/tools/testing/selftests/ublk/test_stress_05.sh b/tools/testing/selftests/ublk/test_stress_05.sh
index 88601b48f1cd..bcfc904cefc6 100755
--- a/tools/testing/selftests/ublk/test_stress_05.sh
+++ b/tools/testing/selftests/ublk/test_stress_05.sh
@@ -60,5 +60,14 @@ if _have_feature "ZERO_COPY"; then
done
fi
+if _have_feature "AUTO_BUF_REG"; then
+ for reissue in $(seq 0 1); do
+ ublk_io_and_remove 8G -t null -q 4 -g --auto_zc -r 1 -i "$reissue" &
+ ublk_io_and_remove 256M -t loop -q 4 -g --auto_zc -r 1 -i "$reissue" "${UBLK_BACKFILES[1]}" &
+ ublk_io_and_remove 8G -t null -q 4 -g -z --auto_zc --auto_zc_fallback -r 1 -i "$reissue" &
+ wait
+ done
+fi
+
_cleanup_test "stress"
_show_result $TID $ERR_CODE
diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c
index d0f6d253ac72..613551132a96 100644
--- a/tools/testing/vsock/vsock_test.c
+++ b/tools/testing/vsock/vsock_test.c
@@ -1264,21 +1264,25 @@ static void test_unsent_bytes_client(const struct test_opts *opts, int type)
send_buf(fd, buf, sizeof(buf), 0, sizeof(buf));
control_expectln("RECEIVED");
- ret = ioctl(fd, SIOCOUTQ, &sock_bytes_unsent);
- if (ret < 0) {
- if (errno == EOPNOTSUPP) {
- fprintf(stderr, "Test skipped, SIOCOUTQ not supported.\n");
- } else {
+ /* SIOCOUTQ isn't guaranteed to instantly track sent data. Even though
+ * the "RECEIVED" message means that the other side has received the
+ * data, there can be a delay in our kernel before updating the "unsent
+ * bytes" counter. Repeat SIOCOUTQ until it returns 0.
+ */
+ timeout_begin(TIMEOUT);
+ do {
+ ret = ioctl(fd, SIOCOUTQ, &sock_bytes_unsent);
+ if (ret < 0) {
+ if (errno == EOPNOTSUPP) {
+ fprintf(stderr, "Test skipped, SIOCOUTQ not supported.\n");
+ break;
+ }
perror("ioctl");
exit(EXIT_FAILURE);
}
- } else if (ret == 0 && sock_bytes_unsent != 0) {
- fprintf(stderr,
- "Unexpected 'SIOCOUTQ' value, expected 0, got %i\n",
- sock_bytes_unsent);
- exit(EXIT_FAILURE);
- }
-
+ timeout_check("SIOCOUTQ");
+ } while (sock_bytes_unsent != 0);
+ timeout_end();
close(fd);
}